diff --git a/.agents/skills/python-kwargs-setattr-security/SKILL.md b/.agents/skills/python-kwargs-setattr-security/SKILL.md new file mode 100644 index 0000000000000..d31d9d9cac3fa --- /dev/null +++ b/.agents/skills/python-kwargs-setattr-security/SKILL.md @@ -0,0 +1,59 @@ +--- +name: python-kwargs-setattr-security +description: When reviewing or fixing Python code that uses setattr() with user-controlled kwargs to configure C++ extension objects (SessionOptions, RunOptions, etc.) in ONNX Runtime. Use this to apply the allowlist pattern that prevents arbitrary file writes and other attacks via reflected property access. +--- + +## Problem Pattern + +Using `hasattr(obj, k) / setattr(obj, k, v)` with user-controlled kwargs is insecure. The `hasattr` check is NOT a security guard — it returns True for ALL exposed properties including dangerous ones. + +```python +# INSECURE — do not use +for k, v in kwargs.items(): + if hasattr(options, k): + setattr(options, k, v) +``` + +## Fix: Explicit Allowlist + +Define a module-level frozenset of safe attribute names. Raise RuntimeError for known-but-blocked attrs; silently ignore unknown keys. + +```python +# Define at module level, before the class +_ALLOWED_SESSION_OPTIONS = frozenset({ + "enable_cpu_mem_arena", + "enable_mem_pattern", + # ... only explicitly reviewed safe attrs +}) + +# In the method +for k, v in kwargs.items(): + if k in _ALLOWED_SESSION_OPTIONS: + setattr(options, k, v) + elif hasattr(options, k): # reuse the existing instance, don't create new + raise RuntimeError( + f"SessionOptions attribute '{k}' is not permitted via the backend API. " + f"Allowed attributes: {', '.join(sorted(_ALLOWED_SESSION_OPTIONS))}" + ) + # else: silently ignore (may be kwargs for a different config object) +``` + +## Key Rules + +1. **Use the existing object** in `hasattr(options, k)` — never `hasattr(ClassName(), k)` (creates throwaway C++ objects per iteration) +2. **RuntimeError** is the ORT convention for API misuse errors (not ValueError) +3. **Silent ignore for one path is OK when kwargs are forwarded to both paths**: `run_model()` passes the same kwargs dict to both `prepare()` (validates SessionOptions) and `rep.run()` (validates RunOptions). A RunOptions kwarg unknown to SessionOptions is silently ignored by `prepare()` — this is correct because `rep.run()` will validate it. Only raise RuntimeError when the attr exists on the target object but is blocked. +4. **Frozenset constant naming**: `_ALLOWED_` — ALL_CAPS, Google Style +5. 
**No type annotations** on module-level constants (ORT Python convention) + +## Dangerous SessionOptions Properties (never allowlist) + +- `optimized_model_filepath` — triggers Model::Save(), overwrites arbitrary files +- `profile_file_prefix` + `enable_profiling` — writes profiling JSON to arbitrary path +- `register_custom_ops_library` — loads arbitrary shared libraries (method, not property) + +## Files in ONNX Runtime + +- `onnxruntime/python/backend/backend.py` — `_ALLOWED_SESSION_OPTIONS` +- `onnxruntime/python/backend/backend_rep.py` — `_ALLOWED_RUN_OPTIONS` +- Tests: `onnxruntime/test/python/onnxruntime_test_python_backend.py` — `TestBackendKwargsAllowlist` diff --git a/.github/workflows/windows_openvino.yml b/.github/workflows/windows_openvino.yml index 8ff7a7071a755..52581c7d0a5f5 100644 --- a/.github/workflows/windows_openvino.yml +++ b/.github/workflows/windows_openvino.yml @@ -51,12 +51,12 @@ jobs: with: architecture: x64 - - name: Download OpenVINO Toolkit v2025.4.1 + - name: Download OpenVINO Toolkit v2026.1.0 env: - OpenVINOVersion: 2025.4.1 + OpenVINOVersion: 2026.1.0 shell: pwsh run: | - $Url ="https://storage.openvinotoolkit.org/repositories/openvino/packages/2025.4.1/windows/openvino_toolkit_windows_2025.4.1.20426.82bbf0292c5_x86_64.zip" + $Url ="https://storage.openvinotoolkit.org/repositories/openvino/packages/2026.1/windows_vc_mt/openvino_toolkit_windows_vc_mt_2026.1.0.21367.63e31528c62_x86_64.zip" $OutputPath = "$env:RUNNER_TEMP\openvino.zip" $ExtractPath = "$env:RUNNER_TEMP\openvino-v$env:OpenVINOVersion" $TempExtractPath = "$env:RUNNER_TEMP\openvino_temp" @@ -99,7 +99,7 @@ jobs: shell: pwsh # Use $GITHUB_ENV to set the variable for subsequent steps run: | - $openVinoRootDir = Join-Path $env:RUNNER_TEMP "openvino-v2025.4.1" + $openVinoRootDir = Join-Path $env:RUNNER_TEMP "openvino-v2026.1.0" echo "OpenVINORootDir=$openVinoRootDir" >> $env:GITHUB_ENV - name: Print OpenVINORootDir after downloading OpenVINO diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 8af161b524bee..83d1751e55543 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -89,6 +89,7 @@ option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF) option(onnxruntime_USE_DNNL "Build with DNNL support" OFF) option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF) option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF) +option(onnxruntime_USE_RVV "Build with RISC-V Vector support in MLAS" OFF) option(onnxruntime_USE_ARM_NEON_NCHWC "Build with ARM Neon NCHWc kernels in MLAS" OFF) option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF) diff --git a/cmake/deps.txt b/cmake/deps.txt index 448e6fcb23f2f..fa37238bbb82e 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -51,7 +51,7 @@ pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/403d652dca4c1046e8145 re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381 -cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v4.2.1.zip;5d2b21b10478556c5e209dd7229e298a5c9f0b02 +cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v4.4.2.zip;4b0bae4428b84370407c0a71778b13dc2eee5be1 
extensions;https://github.com/microsoft/onnxruntime-extensions/archive/c24b7bab0c12f53da76d0c31b03b9f0f8ec8f3b4.zip;239063aee4946a9af147b473a4c3da78ba7413b4 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.12.0.zip;7e733cfdc410d777b76122d64232499205589a96 diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake index 83d8a156b630f..62187fd0ca63f 100644 --- a/cmake/external/cutlass.cmake +++ b/cmake/external/cutlass.cmake @@ -4,7 +4,7 @@ onnxruntime_fetchcontent_declare( URL ${DEP_URL_cutlass} URL_HASH SHA1=${DEP_SHA1_cutlass} EXCLUDE_FROM_ALL - PATCH_COMMAND ${Patch_EXECUTABLE} --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cutlass/cutlass_4.2.1.patch + PATCH_COMMAND ${Patch_EXECUTABLE} --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cutlass/cutlass_4.4.2.patch ) FetchContent_GetProperties(cutlass) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 4afa074a0b254..be0abc980bda6 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -376,6 +376,18 @@ if (CPUINFO_SUPPORTED) ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/win_arm_fp16_detection_fallback.patch FIND_PACKAGE_ARGS NAMES cpuinfo ) + elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") + message(STATUS "Applying sysfs fallback patch for cpuinfo on Linux") + onnxruntime_fetchcontent_declare( + pytorch_cpuinfo + URL ${DEP_URL_pytorch_cpuinfo} + URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo} + EXCLUDE_FROM_ALL + PATCH_COMMAND + # https://github.com/microsoft/onnxruntime/issues/10038 + ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/fix_missing_sysfs_fallback.patch + FIND_PACKAGE_ARGS NAMES cpuinfo + ) else() onnxruntime_fetchcontent_declare( pytorch_cpuinfo diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index bde73252449dc..0233254ad50ad 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -435,6 +435,8 @@ else() set(X86 TRUE) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$") set(X86_64 TRUE) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64.*") + set(RISCV64 TRUE) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64.*") set(LOONGARCH64 TRUE) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x$") @@ -903,6 +905,48 @@ endif() set(MLAS_SOURCE_IS_NOT_SET 0) endif() endif() + if(RISCV64 AND MLAS_SOURCE_IS_NOT_SET) + file(GLOB_RECURSE mlas_platform_srcs CONFIGURE_DEPENDS + "${MLAS_SRC_DIR}/scalar/*.cpp") + + if(onnxruntime_USE_RVV) + set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") + set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS} -march=rv64gcv -mabi=lp64d") + check_cxx_source_compiles(" + #include + #include + int main() { + size_t vl = __riscv_vsetvl_e32m1(4); + return static_cast(vl == 0); + }" + HAS_RISCV64_RVV + ) + set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}") + unset(OLD_CMAKE_REQUIRED_FLAGS) + + if(HAS_RISCV64_RVV) + list(APPEND mlas_platform_srcs + ${MLAS_SRC_DIR}/riscv64/sgemm_pack_b_rvv.cpp + ${MLAS_SRC_DIR}/riscv64/sgemm_kernel_rvv.cpp + ${MLAS_SRC_DIR}/riscv64/softmax_kernel_rvv.cpp + ) + set_source_files_properties( + ${MLAS_SRC_DIR}/riscv64/sgemm_pack_b_rvv.cpp + ${MLAS_SRC_DIR}/riscv64/sgemm_kernel_rvv.cpp + ${MLAS_SRC_DIR}/riscv64/softmax_kernel_rvv.cpp + PROPERTIES COMPILE_FLAGS "-march=rv64gcv -mabi=lp64d") + 
list(APPEND mlas_private_compile_definitions MLAS_USE_RVV=1) + else() + message( + WARNING + "onnxruntime_USE_RVV was requested, but the compiler does not support rv64gcv RVV intrinsics. Falling back to scalar MLAS kernels.") + endif() + endif() + + if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH) + set(MLAS_SOURCE_IS_NOT_SET 0) + endif() + endif() if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET) file(GLOB_RECURSE mlas_platform_srcs "${MLAS_SRC_DIR}/scalar/*.cpp") @@ -997,4 +1041,4 @@ if (NOT onnxruntime_ORT_MINIMAL_BUILD) endif() endif() -endif() \ No newline at end of file +endif() diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 4e5636572b94a..bd12b50b7af43 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1400,6 +1400,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) SET(MLAS_BENCH_DIR ${TEST_SRC_DIR}/mlas/bench) file(GLOB_RECURSE MLAS_BENCH_SOURCE_FILES "${MLAS_BENCH_DIR}/*.cpp" "${MLAS_BENCH_DIR}/*.h") + list(FILTER MLAS_BENCH_SOURCE_FILES EXCLUDE REGEX "${MLAS_BENCH_DIR}/riscv64/.*") onnxruntime_add_executable(onnxruntime_mlas_benchmark ${MLAS_BENCH_SOURCE_FILES} ${ONNXRUNTIME_ROOT}/core/framework/error_code.cc) target_include_directories(onnxruntime_mlas_benchmark PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc) target_link_libraries(onnxruntime_mlas_benchmark PRIVATE benchmark::benchmark onnxruntime_util ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS}) @@ -1418,6 +1419,33 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) target_link_libraries(onnxruntime_mlas_benchmark PRIVATE cpuinfo) endif() set_target_properties(onnxruntime_mlas_benchmark PROPERTIES FOLDER "ONNXRuntimeTest") + + endif() + + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64.*") + set(MLAS_RISCV64_BENCH_DIR ${TEST_SRC_DIR}/mlas/bench/riscv64) + + onnxruntime_add_executable( + onnxruntime_mlas_sgemm_riscv_bench + ${MLAS_RISCV64_BENCH_DIR}/sgemm_riscv_bench.cpp) + target_include_directories(onnxruntime_mlas_sgemm_riscv_bench PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc) + target_link_libraries( + onnxruntime_mlas_sgemm_riscv_bench + PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS}) + target_compile_definitions(onnxruntime_mlas_sgemm_riscv_bench PRIVATE ${mlas_private_compile_definitions}) + set_target_properties(onnxruntime_mlas_sgemm_riscv_bench PROPERTIES FOLDER "ONNXRuntimeTest") + + onnxruntime_add_executable( + onnxruntime_mlas_softmax_riscv_compare + ${MLAS_RISCV64_BENCH_DIR}/softmax_rvv_compare.cpp) + target_include_directories( + onnxruntime_mlas_softmax_riscv_compare + PRIVATE ${ONNXRUNTIME_ROOT} ${ONNXRUNTIME_ROOT}/core/mlas/inc) + target_link_libraries( + onnxruntime_mlas_softmax_riscv_compare + PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS}) + target_compile_definitions(onnxruntime_mlas_softmax_riscv_compare PRIVATE ${mlas_private_compile_definitions}) + set_target_properties(onnxruntime_mlas_softmax_riscv_compare PROPERTIES FOLDER "ONNXRuntimeTest") endif() if(WIN32) diff --git a/cmake/patches/cpuinfo/fix_missing_sysfs_fallback.patch b/cmake/patches/cpuinfo/fix_missing_sysfs_fallback.patch new file mode 100644 index 0000000000000..005cd458fdd2b --- /dev/null +++ b/cmake/patches/cpuinfo/fix_missing_sysfs_fallback.patch @@ -0,0 +1,83 @@ +diff --git a/src/linux/processors.c b/src/linux/processors.c +index 47bee76..d0c5569 100644 +--- a/src/linux/processors.c ++++ b/src/linux/processors.c +@@ -2,0 +3 @@ ++#include +@@ -291,0 +293,22 @@ ++static uint32_t 
cpuinfo_linux_get_max_processor_from_sysconf( ++ uint32_t max_processors_count, ++ const char* processor_list_name) { ++ const long nproc = sysconf(_SC_NPROCESSORS_ONLN); ++ if (nproc <= 0) { ++ cpuinfo_log_warning( ++ "failed to query online processors from sysconf(_SC_NPROCESSORS_ONLN) for %s", ++ processor_list_name); ++ return UINT32_MAX; ++ } ++ ++ uint32_t max_processor = (uint32_t)(nproc - 1); ++ if ((uint64_t)nproc > (uint64_t)max_processors_count) { ++ cpuinfo_log_warning( ++ "online processors count %ld exceeds system limit %" PRIu32 ": truncating to the latter", ++ nproc, ++ max_processors_count); ++ max_processor = max_processors_count - 1; ++ } ++ return max_processor; ++} ++ +@@ -301 +324 @@ +- return UINT32_MAX; ++ return cpuinfo_linux_get_max_processor_from_sysconf(max_processors_count, POSSIBLE_CPULIST_FILENAME); +@@ -323 +346 @@ +- return UINT32_MAX; ++ return cpuinfo_linux_get_max_processor_from_sysconf(max_processors_count, PRESENT_CPULIST_FILENAME); +@@ -357,0 +381,31 @@ ++static bool cpuinfo_linux_detect_processors_from_sysconf( ++ uint32_t max_processors_count, ++ uint32_t* processor0_flags, ++ uint32_t processor_struct_size, ++ uint32_t detected_flag, ++ const char* processor_list_name) { ++ const long nproc = sysconf(_SC_NPROCESSORS_ONLN); ++ if (nproc <= 0) { ++ cpuinfo_log_warning( ++ "failed to query online processors from sysconf(_SC_NPROCESSORS_ONLN) for %s", ++ processor_list_name); ++ return false; ++ } ++ ++ uint32_t processors_count = (uint32_t)nproc; ++ if ((uint64_t)nproc > (uint64_t)max_processors_count) { ++ cpuinfo_log_warning( ++ "online processors count %ld exceeds system limit %" PRIu32 ": truncating to the latter", ++ nproc, ++ max_processors_count); ++ processors_count = max_processors_count; ++ } ++ ++ for (uint32_t processor = 0; processor < processors_count; processor++) { ++ *((uint32_t*)((uintptr_t)processor0_flags + processor_struct_size * processor)) |= detected_flag; ++ } ++ cpuinfo_log_warning( ++ "falling back to sysconf(_SC_NPROCESSORS_ONLN) = %ld for %s", nproc, processor_list_name); ++ return true; ++} ++ +@@ -373 +427,6 @@ +- return false; ++ return cpuinfo_linux_detect_processors_from_sysconf( ++ max_processors_count, ++ processor0_flags, ++ processor_struct_size, ++ possible_flag, ++ POSSIBLE_CPULIST_FILENAME); +@@ -392 +451,6 @@ +- return false; ++ return cpuinfo_linux_detect_processors_from_sysconf( ++ max_processors_count, ++ processor0_flags, ++ processor_struct_size, ++ present_flag, ++ PRESENT_CPULIST_FILENAME); diff --git a/cmake/patches/cutlass/cutlass_4.2.1.patch b/cmake/patches/cutlass/cutlass_4.4.2.patch similarity index 100% rename from cmake/patches/cutlass/cutlass_4.2.1.patch rename to cmake/patches/cutlass/cutlass_4.4.2.patch diff --git a/include/onnxruntime/ep/adapter/allocator.h b/include/onnxruntime/ep/adapter/allocator.h index 4e601bb22252b..1798be23e4ed0 100644 --- a/include/onnxruntime/ep/adapter/allocator.h +++ b/include/onnxruntime/ep/adapter/allocator.h @@ -8,6 +8,7 @@ #endif #include +#include #include "core/framework/allocator.h" @@ -15,6 +16,59 @@ namespace onnxruntime { namespace ep { namespace adapter { +// Wraps an OrtAllocator* exposed by the C API as an IAllocator. +// Takes ownership of the wrapped Ort::Allocator and releases it on destruction. 
+class IAllocatorWrappingOrtAllocator final : public IAllocator { + public: + explicit IAllocatorWrappingOrtAllocator(Ort::Allocator ort_allocator) + : IAllocator(*(EnsureOrtAllocatorHasValue(ort_allocator).GetInfo())), + ort_allocator_(std::move(ort_allocator)) { + } + + void* Alloc(size_t size) override { + return ort_allocator_.Alloc(size); + } + + void Free(void* p) override { + ort_allocator_.Free(p); + } + + void* Reserve(size_t size) override { + return ort_allocator_.Reserve(size); + } + + bool IsStreamAware() const override { + return false; + + // TODO: Enable once AllocOnStream() is implemented. + // static constexpr uint32_t kOrtAllocatorAllocOnStreamMinVersion = 23; + // const OrtAllocator* raw = ort_allocator_; + // return raw->version >= kOrtAllocatorAllocOnStreamMinVersion && raw->AllocOnStream != nullptr; + } + + void* AllocOnStream(size_t /*size*/, Stream* /*stream*/) override { + // TODO: Implement AllocOnStream(). + // The internal `onnxruntime::IAllocator::AllocOnStream` signature takes an internal `onnxruntime::Stream*` + // argument, while the public `::OrtAllocator::AllocOnStream` signature takes an `::OrtSyncStream*` argument. + // We need to properly map from one to the other. + // `::OrtSyncStream*` should be treated as an opaque type from the plugin EP's perspective. + ORT_NOT_IMPLEMENTED("IAllocatorWrappingOrtAllocator::AllocOnStream is not implemented yet."); + } + + private: + static const Ort::Allocator& EnsureOrtAllocatorHasValue(const Ort::Allocator& ort_allocator) { + ORT_ENFORCE(ort_allocator != nullptr, "Ort::Allocator must contain a non-nullptr OrtAllocator."); + return ort_allocator; + } + + // TODO: Consider adding GetStats() override. Requires parsing OrtKeyValuePairs from the C API + // into AllocatorStats; see GetStatsFromOrtAllocator() in allocator_adapters.cc for reference. + + Ort::Allocator ort_allocator_; + + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(IAllocatorWrappingOrtAllocator); +}; + /// /// A bridge class between the EP API OrtAllocator and an IAllocator implementation. /// diff --git a/include/onnxruntime/ep/adapter/op_kernel_info.h b/include/onnxruntime/ep/adapter/op_kernel_info.h index 7e61385f3686c..417ebd4adf7a2 100644 --- a/include/onnxruntime/ep/adapter/op_kernel_info.h +++ b/include/onnxruntime/ep/adapter/op_kernel_info.h @@ -8,14 +8,16 @@ #endif #include +#include +#include "core/common/inlined_containers.h" #include "core/common/narrow.h" #include "core/common/status.h" #include "core/framework/config_options.h" -#include "core/framework/op_kernel_info.h" #include "core/framework/tensor_shape.h" #include "core/framework/tensor.h" +#include "allocator.h" #include "node.h" #include "kernel_def.h" #include "tensor_helper.h" @@ -43,12 +45,11 @@ struct OpKernelInfo { // to manage the lifetime of the cached data. struct KernelInfoCache { explicit KernelInfoCache(const OrtKernelInfo* kernel_info) : kernel_info_(kernel_info) { - const auto* core_kernel_info = reinterpret_cast(kernel_info); - execution_provider_ = core_kernel_info->GetExecutionProvider(); - ort_ep_ = execution_provider_ != nullptr ? execution_provider_->GetOrtEp() : nullptr; - ep_impl_ = ort_ep_ != nullptr ? 
(static_cast(ort_ep_))->EpImpl() : execution_provider_; - Ort::ConstKernelInfo info{kernel_info}; + ort_ep_ = info.GetEp(); + ORT_ENFORCE(ort_ep_ != nullptr, "Plugin EP adapter requires a non-null OrtEp"); + ep_impl_ = static_cast(ort_ep_)->EpImpl(); + const size_t input_count = info.GetInputCount(); constant_input_tensors.resize(input_count); for (size_t i = 0; i < input_count; ++i) { @@ -60,10 +61,13 @@ struct OpKernelInfo { } } const OrtKernelInfo* kernel_info_; - const ::onnxruntime::IExecutionProvider* execution_provider_{}; const OrtEp* ort_ep_{}; const ::onnxruntime::IExecutionProvider* ep_impl_{}; std::vector constant_input_tensors; + + mutable std::shared_mutex allocator_cache_mutex_; + mutable InlinedHashMap allocator_cache_; + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(KernelInfoCache); }; @@ -74,11 +78,34 @@ struct OpKernelInfo { return (static_cast(cache_->ort_ep_))->GetDataTransferManager(); } - // Delegates to the core OpKernelInfo::GetAllocator so the adapter returns - // exactly the same allocator the framework would provide for each OrtMemType. AllocatorPtr GetAllocator(OrtMemType mem_type) const { - const auto* core_kernel_info = reinterpret_cast(cache_->kernel_info_); - return core_kernel_info->GetAllocator(mem_type); + { + std::shared_lock lock(cache_->allocator_cache_mutex_); + auto it = cache_->allocator_cache_.find(mem_type); + if (it != cache_->allocator_cache_.end()) { + return it->second; + } + } + + std::unique_lock lock(cache_->allocator_cache_mutex_); + // Double-check after acquiring exclusive lock + auto it = cache_->allocator_cache_.find(mem_type); + if (it != cache_->allocator_cache_.end()) { + return it->second; + } + + OrtAllocator* ort_allocator_raw = nullptr; + Ort::Status status(Ort::GetApi().KernelInfoGetAllocator(cache_->kernel_info_, mem_type, &ort_allocator_raw)); + + if (!status.IsOK() || ort_allocator_raw == nullptr) { + cache_->allocator_cache_.emplace(mem_type, nullptr); + return nullptr; + } + + Ort::Allocator ort_allocator{ort_allocator_raw}; + auto allocator = std::make_shared(std::move(ort_allocator)); + cache_->allocator_cache_.emplace(mem_type, allocator); + return allocator; } Node node() const noexcept { diff --git a/js/react_native/README.md b/js/react_native/README.md index f7b118e81573d..d57dbad2b37f8 100644 --- a/js/react_native/README.md +++ b/js/react_native/README.md @@ -16,6 +16,18 @@ With ONNX Runtime React Native, React Native developers can score pre-trained ON npm install onnxruntime-react-native ``` +React Native's autolinking registers the native Android and iOS modules automatically. No manual changes to `settings.gradle`, `build.gradle`, or `MainApplication` are required for bare React Native projects. + +For Expo managed/prebuild workflows, add the config plugin to your `app.json`/`app.config.js`: + +```json +{ + "plugins": ["onnxruntime-react-native"] +} +``` + +Then run `npx expo prebuild` to apply the native changes. 
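+
+If your project uses `app.config.js` instead of `app.json`, the equivalent entry is the same `plugins` array. A minimal sketch (merge it into your existing config export; field names other than `plugins` are placeholders):
+
+```js
+// app.config.js: minimal sketch, keep the rest of your existing Expo config
+module.exports = {
+  // ...name, slug, and other existing fields
+  plugins: ['onnxruntime-react-native'],
+};
+```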
+ ### Usage ```js diff --git a/js/react_native/app.plugin.js b/js/react_native/app.plugin.js index 2fa117b1a14e5..7f6bd8b55dae3 100644 --- a/js/react_native/app.plugin.js +++ b/js/react_native/app.plugin.js @@ -23,6 +23,56 @@ const withOrt = (config) => { return config; }); + // Register OnnxruntimePackage in MainApplication for New Architecture / Expo prebuild + config = configPlugin.withMainApplication(config, (config) => { + const lang = config.modResults.language; + if (lang === 'kt') { + config.modResults.contents = generateCode.mergeContents({ + src: config.modResults.contents, + newSrc: 'import ai.onnxruntime.reactnative.OnnxruntimePackage', + tag: 'onnxruntime-react-native-import', + anchor: /^import /m, + offset: 0, + comment: '//', + }).contents; + config.modResults.contents = generateCode.mergeContents({ + src: config.modResults.contents, + newSrc: ' add(OnnxruntimePackage())', + tag: 'onnxruntime-react-native-package', + anchor: /override fun getPackages\(\)/, + offset: 2, + comment: '//', + }).contents; + } else if (lang === 'java') { + config.modResults.contents = generateCode.mergeContents({ + src: config.modResults.contents, + newSrc: 'import ai.onnxruntime.reactnative.OnnxruntimePackage;', + tag: 'onnxruntime-react-native-import', + anchor: /^import /m, + offset: 0, + comment: '//', + }).contents; + if (!config.modResults.contents.includes('packages.add(new OnnxruntimePackage())')) { + if (/return\s+new PackageList\(this\)\.getPackages\(\);/.test(config.modResults.contents)) { + config.modResults.contents = config.modResults.contents.replace( + /(\s*)return\s+new PackageList\(this\)\.getPackages\(\);/, + '$1List packages = new PackageList(this).getPackages();\n$1packages.add(new OnnxruntimePackage());\n$1return packages;', + ); + } else { + config.modResults.contents = generateCode.mergeContents({ + src: config.modResults.contents, + newSrc: ' packages.add(new OnnxruntimePackage());', + tag: 'onnxruntime-react-native-package', + anchor: /^\s*List\s+packages\s*=\s*new PackageList\(this\)\.getPackages\(\);\s*$/m, + offset: 1, + comment: '//', + }).contents; + } + } + } + return config; + }); + // Add build dependency to pod file config = configPlugin.withDangerousMod(config, [ 'ios', diff --git a/js/react_native/package.json b/js/react_native/package.json index 854e66c6f7239..b518adf14b327 100644 --- a/js/react_native/package.json +++ b/js/react_native/package.json @@ -49,6 +49,7 @@ "ios/*.mm", "onnxruntime-react-native.podspec", "app.plugin.js", + "react-native.config.js", "unimodule.json", "!dist/commonjs/*.js.map", "!dist/module/*.js.map", diff --git a/js/react_native/react-native.config.js b/js/react_native/react-native.config.js new file mode 100644 index 0000000000000..87759a6f45ad9 --- /dev/null +++ b/js/react_native/react-native.config.js @@ -0,0 +1,11 @@ +module.exports = { + dependency: { + platforms: { + android: { + packageImportPath: 'import ai.onnxruntime.reactnative.OnnxruntimePackage;', + packageInstance: 'new OnnxruntimePackage()', + }, + ios: {}, + }, + }, +}; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts index fe2567e71d49a..9bbad9839d616 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts @@ -62,6 +62,14 @@ const validateInputs = (inputs: readonly TensorView[], attributes: RotaryEmbeddi } } + if (sequenceLength > maxSequenceLength) { + throw new Error('Updating cos_cache and sin_cache in RotaryEmbedding is not 
currently supported'); + } + + // Note: position_ids value validation is handled by shader-side bounds checks (defense-in-depth). + // We cannot validate position_ids values here because the tensor is GPU-resident — its data field + // is a GPU buffer ID, not a WASM heap pointer, so getBigInt64Array() would read garbage. + if (headSize / 2 !== cosCache.dims[1] && rotaryEmbeddingDim / 2 !== cosCache.dims[1]) { throw new Error( `Input 'cos_cache' dimension 1 should be same as head_size / 2 or rotary_embedding_dim / 2, got ${ @@ -69,10 +77,6 @@ const validateInputs = (inputs: readonly TensorView[], attributes: RotaryEmbeddi }`, ); } - - if (sequenceLength > maxSequenceLength) { - throw new Error('Updating cos_cache and sin_cache in RotaryEmbedding is not currently supported'); - } }; export const createRotaryEmbeddingProgramInfo = ( diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_data.h b/onnxruntime/contrib_ops/cuda/bert/attention_data.h index c54a1fea9ad3a..98f92b79e6ec6 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_data.h +++ b/onnxruntime/contrib_ops/cuda/bert/attention_data.h @@ -237,11 +237,27 @@ struct PagedAttentionData { // Fused op buffers T* workspace_buffer = nullptr; + // Memory-efficient attention (CUTLASS fMHA) buffers for the unfused fallback path + // taken when FlashAttention is unavailable (SM<80 or ORT_DISABLE_FLASH_ATTENTION). + T* gathered_key = nullptr; // [total_kv_tokens, num_heads, head_size], packed varlen (GQA-expanded) + T* gathered_value = nullptr; // [total_kv_tokens, num_heads, head_size], packed varlen (GQA-expanded) + T* fmha_buffer = nullptr; // CUTLASS fMHA output-accumulator workspace + // Populated by the caller after a D->H sync on cumulative_seqlens_kv[batch_size]. + int total_kv_tokens = 0; + + // Actual max of per-batch new-query lengths (cumulative_seqlens_q[i+1] - cumulative_seqlens_q[i]). + // Populated by the caller via the same D->H sync so the MEA path's rotary grid and MEA's + // grid_x (ceil_div(sequence_length, kQueriesPerBlock)) cover every query token. The previous + // heuristic `token_count - batch_size + 1` underestimates when any batch has 0 new tokens, + // producing silent per-token dropout in MEA and rotary. 
+ int max_query_len = 0; + // Output Tensors T* output = nullptr; // Kernel Flags bool use_flash_attention = false; + bool use_memory_efficient_attention = false; }; } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/paged_attention.cc b/onnxruntime/contrib_ops/cuda/bert/paged_attention.cc index 5df2c8b438771..7fba61270e280 100644 --- a/onnxruntime/contrib_ops/cuda/bert/paged_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/paged_attention.cc @@ -9,6 +9,7 @@ #include "contrib_ops/cuda/bert/paged_attention.h" #include "contrib_ops/cuda/bert/paged_attention_helper.h" #include "contrib_ops/cuda/bert/flash_attention/flash_api.h" +#include "contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h" using namespace onnxruntime::cuda; using namespace ::onnxruntime::common; @@ -50,6 +51,7 @@ PagedAttention::PagedAttention(const OpKernelInfo& info) kernel_options_ = this->GetAttentionKernelOptions(); disable_flash_attention_ = sizeof(T) != 2 || !kernel_options_->UseFlashAttention(); + disable_memory_efficient_attention_ = sizeof(T) != 2 || !kernel_options_->UseEfficientAttention(); } template @@ -141,31 +143,57 @@ Status PagedAttention::ComputeInternal(OpKernelContext* context) const { "value_cache and value_cache_out must be the same buffer"); } - // Check flash kernel availability and allocate buffers + // Empty query input: output is already shaped [0, hidden_size], and the cache outputs + // alias the input caches (verified above), so no backend kernel or cache update is needed. + if (parameters.token_count == 0) { + return Status::OK(); + } + + // Kernel backend selection — FlashAttention preferred, fall back to MemoryEfficientAttention. #if USE_FLASH_ATTENTION bool use_flash_attention = !disable_flash_attention_ && onnxruntime::flash::is_supported(device_prop, parameters.head_size, parameters.num_heads, parameters.kv_num_heads); - size_t softmax_lse_bytes = 0; - if (use_flash_attention) { - softmax_lse_bytes = onnxruntime::flash::get_softmax_lse_size(parameters.token_count, - parameters.num_heads); - } - auto softmax_lse_buffer = GetScratchBuffer(softmax_lse_bytes, GetComputeStream(context)); #else constexpr bool use_flash_attention = false; - auto softmax_lse_buffer = GetScratchBuffer(0, GetComputeStream(context)); // nullptr #endif - if (!use_flash_attention) { +#if USE_MEMORY_EFFICIENT_ATTENTION + const int sm = device_prop.major * 10 + device_prop.minor; + const bool is_half = std::is_same::value; + const bool is_bf16 = std::is_same::value; + bool use_memory_efficient_attention = + !use_flash_attention && + !disable_memory_efficient_attention_ && + has_memory_efficient_attention(sm, is_half, is_bf16, + parameters.head_size, parameters.head_size); +#else + constexpr bool use_memory_efficient_attention = false; +#endif + + if (!use_flash_attention && !use_memory_efficient_attention) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Currently PagedAttention is only supported through the FlashAttention kernel."); + "PagedAttention requires FlashAttention (sm>=80, fp16/bf16) or " + "MemoryEfficientAttention (fp16 sm>=53, bf16 sm>=80, head_size<=1024 and %8==0) " + "to be available. Check ORT_DISABLE_FLASH_ATTENTION / " + "ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION env vars and dtype/head_size."); } + // Scratch buffers common to both backends. 
+ size_t softmax_lse_bytes = 0; +#if USE_FLASH_ATTENTION + if (use_flash_attention) { + softmax_lse_bytes = onnxruntime::flash::get_softmax_lse_size(parameters.token_count, + parameters.num_heads); + } +#endif + auto softmax_lse_buffer = GetScratchBuffer(softmax_lse_bytes, GetComputeStream(context)); + size_t cumulative_seqlens_kv_bytes = sizeof(int) * (parameters.batch_size + 1); auto cumulative_seqlens_kv_buffer = GetScratchBuffer(cumulative_seqlens_kv_bytes, GetComputeStream(context)); + int* cumulative_seqlens_kv_ptr = reinterpret_cast(cumulative_seqlens_kv_buffer.get()); size_t workspace_buffer_bytes = 0; if (do_rotary_) { @@ -175,10 +203,91 @@ Status PagedAttention::ComputeInternal(OpKernelContext* context) const { } auto workspace_buffer = GetScratchBuffer(workspace_buffer_bytes, GetComputeStream(context)); + // Populate cumulative_seqlens_kv for both backends. The MEA path additionally needs + // the last element on the host to size the tight gather buffers, so we D->H sync below. + // + // LaunchGetCumulativeSeqlensKV uses a per-block cub::BlockScan with a block size of 256 + // and launches (batch_size + 255) / 256 blocks, so blocks scan independently. Enforce + // batch_size <= 256 so the cumulative sum is correct; a larger batch would silently + // produce wrong KV offsets. (A future grid-wide scan could lift this limit.) + constexpr int kMaxBatchSizeForCumulativeSeqlensKV = 256; + if (parameters.batch_size > kMaxBatchSizeForCumulativeSeqlensKV) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "PagedAttention currently supports batch_size <= ", + kMaxBatchSizeForCumulativeSeqlensKV, + " (LaunchGetCumulativeSeqlensKV limitation); got batch_size=", + parameters.batch_size, "."); + } + + cudaStream_t cuda_stream = static_cast(ort_stream.get()->GetHandle()); + ORT_RETURN_IF_ERROR(LaunchGetCumulativeSeqlensKV( + cumulative_seqlens_kv_ptr, + reinterpret_cast(cumulative_seqlens_q->Data()), + reinterpret_cast(past_seqlens->Data()), + parameters.batch_size, cuda_stream)); + + int total_kv_tokens = 0; + int max_query_len = 0; + IAllocatorUniquePtr gathered_key_buffer; + IAllocatorUniquePtr gathered_value_buffer; + IAllocatorUniquePtr fmha_buffer; + +#if USE_MEMORY_EFFICIENT_ATTENTION + if (use_memory_efficient_attention) { + // MEA needs two host-side quantities: + // - total_kv_tokens (= cumulative_seqlens_kv[batch_size]) to size tight gather buffers. + // - max_query_len (= max per-batch new-query length) to size the rotary and MEA grids + // correctly. The heuristic `token_count - batch_size + 1` underestimates when any + // batch has 0 new tokens (valid input), silently dropping query-tokens from those + // larger-than-average batches. + // Both come from cumulative_seqlens_q / cumulative_seqlens_kv, which are tiny (batch+1 + // ints each), so one D->H copy of the full arrays is cheaper than issuing an extra + // reduction kernel and avoids a second sync. 
+ const int kCumulativeCount = parameters.batch_size + 1; + auto cum_q_pinned = this->AllocateBufferOnCPUPinned(kCumulativeCount); + auto cum_kv_pinned = this->AllocateBufferOnCPUPinned(kCumulativeCount); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cum_q_pinned.get(), + reinterpret_cast(cumulative_seqlens_q->Data()), + sizeof(int) * kCumulativeCount, cudaMemcpyDeviceToHost, cuda_stream)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cum_kv_pinned.get(), cumulative_seqlens_kv_ptr, + sizeof(int) * kCumulativeCount, cudaMemcpyDeviceToHost, cuda_stream)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(cuda_stream)); + total_kv_tokens = cum_kv_pinned.get()[parameters.batch_size]; + for (int i = 0; i < parameters.batch_size; ++i) { + const int q_len_i = cum_q_pinned.get()[i + 1] - cum_q_pinned.get()[i]; + if (q_len_i > max_query_len) { + max_query_len = q_len_i; + } + } + if (total_kv_tokens == 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "PagedAttention MEA fallback: total_kv_tokens is zero for non-empty input."); + } + if (total_kv_tokens < 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "PagedAttention MEA fallback: total_kv_tokens is negative (", total_kv_tokens, ")."); + } + + const size_t gather_elems = static_cast(total_kv_tokens) * + parameters.num_heads * parameters.head_size; + gathered_key_buffer = GetScratchBuffer(sizeof(T) * gather_elems, GetComputeStream(context)); + gathered_value_buffer = GetScratchBuffer(sizeof(T) * gather_elems, GetComputeStream(context)); + + if (MemoryEfficientAttentionParams::need_workspace(parameters.head_size, sizeof(T) == sizeof(float))) { + // MEA output accumulator is float32 regardless of input dtype (see GQA pattern at + // group_query_attention.cc:482); use sizeof(float), not sizeof(T). + const size_t fmha_elems = static_cast(parameters.token_count) * + parameters.num_heads * parameters.head_size; + fmha_buffer = GetScratchBuffer(sizeof(float) * fmha_elems, GetComputeStream(context)); + } + } +#endif + // Print debug info if (kernel_options_->AllowDebugInfo()) { AttentionKernelDebugInfo debug_info; debug_info.use_flash_attention = use_flash_attention; + debug_info.use_efficient_attention = use_memory_efficient_attention; debug_info.Print("PagedAttention", this->Node().Name(), @@ -194,10 +303,11 @@ Status PagedAttention::ComputeInternal(OpKernelContext* context) const { data.value_cache = reinterpret_cast(const_cast(value_cache->Data())); data.cumulative_seqlens_q = reinterpret_cast(cumulative_seqlens_q->Data()); data.past_seqlens = reinterpret_cast(past_seqlens->Data()); - data.cumulative_seqlens_kv = reinterpret_cast(cumulative_seqlens_kv_buffer.get()); + data.cumulative_seqlens_kv = cumulative_seqlens_kv_ptr; data.block_table = reinterpret_cast(block_table->Data()); data.output = reinterpret_cast(output->MutableData()); data.use_flash_attention = use_flash_attention; + data.use_memory_efficient_attention = use_memory_efficient_attention; if (softmax_lse_buffer != nullptr) { data.softmax_lse = reinterpret_cast(softmax_lse_buffer.get()); } @@ -208,6 +318,15 @@ Status PagedAttention::ComputeInternal(OpKernelContext* context) const { data.cos_cache = reinterpret_cast(cos_cache->Data()); data.sin_cache = reinterpret_cast(sin_cache->Data()); } + if (use_memory_efficient_attention) { + data.gathered_key = reinterpret_cast(gathered_key_buffer.get()); + data.gathered_value = reinterpret_cast(gathered_value_buffer.get()); + if (fmha_buffer != nullptr) { + data.fmha_buffer = reinterpret_cast(fmha_buffer.get()); + } + data.total_kv_tokens = total_kv_tokens; + 
data.max_query_len = max_query_len; + } cublasHandle_t cublas = GetCublasHandle(context); diff --git a/onnxruntime/contrib_ops/cuda/bert/paged_attention.h b/onnxruntime/contrib_ops/cuda/bert/paged_attention.h index a3df144745f61..027141f02b9ae 100644 --- a/onnxruntime/contrib_ops/cuda/bert/paged_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/paged_attention.h @@ -29,6 +29,7 @@ class PagedAttention final : public CudaKernel { float scale_; float softcap_; bool disable_flash_attention_; + bool disable_memory_efficient_attention_; const AttentionKernelOptions* kernel_options_; }; diff --git a/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.cu index 06608ebed44cc..2241fa232a2c0 100644 --- a/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.cu @@ -9,6 +9,7 @@ #include "contrib_ops/cuda/bert/attention_softmax.h" #include "contrib_ops/cuda/utils/dump_cuda_tensor.h" #include "contrib_ops/cuda/bert/flash_attention/flash_api.h" +#include "contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h" #include "contrib_ops/cuda/bert/paged_attention_impl.h" #include "core/providers/cuda/shared_inc/cuda_call.h" #include "contrib_ops/cuda/bert/rotary_embedding_impl.h" @@ -237,6 +238,101 @@ Status LaunchReshapeAndCache(const T* key, const T* value, T* key_cache, T* valu return CUDA_CALL(cudaGetLastError()); } +// Gather paged KV into packed-varlen [total_kv_tokens, num_heads, head_size], expanding GQA heads. +// total_elems = total_kv_tokens * num_heads * head_size can exceed INT32_MAX for realistic +// large-context GQA configs (e.g., 2M tokens * 64 * 128 = 16.4B), so the linear index is int64_t +// and the kernel uses a grid-stride loop instead of a single (tid >= total_elems) early-exit. +template +__global__ void GatherAndExpandPagedKVCache(const T* __restrict__ key_cache, + const T* __restrict__ value_cache, + T* __restrict__ gathered_key, + T* __restrict__ gathered_value, + const int* __restrict__ block_table, + const int* __restrict__ cumulative_seqlens_kv, + const int batch_size, + const int num_heads, + const int kv_num_heads, + const int head_size, + const int block_size, + const int max_num_blocks_per_seq, + const int64_t total_elems) { + const int64_t stride = static_cast(gridDim.x) * blockDim.x; + const int64_t num_heads_times_head = static_cast(num_heads) * head_size; + const int q_kv_head_ratio = num_heads / kv_num_heads; + const int64_t page_stride = static_cast(block_size) * kv_num_heads * head_size; + + for (int64_t tid = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; + tid < total_elems; + tid += stride) { + const int h = static_cast(tid % head_size); + const int head_id = static_cast((tid / head_size) % num_heads); + const int token_id = static_cast(tid / num_heads_times_head); + + // cumulative_seqlens_kv is a prefix sum of non-negative per-batch KV lengths + // (past_seqlens[i] + new_tokens[i]), so it is monotonically non-decreasing for + // any valid op input — the same assumption the previous linear scan made. + // Binary-search for the batch this token belongs to: log2(batch_size) is strictly + // better than the linear scan, which ran once per (token, head, h) element and + // multiplied its cost by num_heads * head_size. 
+ int left = 0; + int right = batch_size; + while (left < right) { + const int mid = left + (right - left) / 2; + if (token_id < cumulative_seqlens_kv[mid + 1]) { + right = mid; + } else { + left = mid + 1; + } + } + const int batch_id = left; + + const int pos = token_id - cumulative_seqlens_kv[batch_id]; + const int block_idx_in_seq = pos / block_size; + const int block_offset = pos % block_size; + const int block_id = block_table[batch_id * max_num_blocks_per_seq + block_idx_in_seq]; + + // GQA expansion: each output head maps to kv_head_id = head_id / (num_heads / kv_num_heads). + // For MHA (num_heads == kv_num_heads) this is the identity. + const int kv_head_id = head_id / q_kv_head_ratio; + + const int64_t paged_idx = static_cast(block_id) * page_stride + + static_cast(block_offset) * kv_num_heads * head_size + + kv_head_id * head_size + + h; + + gathered_key[tid] = key_cache[paged_idx]; + gathered_value[tid] = value_cache[paged_idx]; + } +} + +template +Status LaunchGatherAndExpandPagedKVCache(const T* key_cache, const T* value_cache, + T* gathered_key, T* gathered_value, + const int* block_table, const int* cumulative_seqlens_kv, + const int batch_size, const int num_heads, + const int kv_num_heads, const int head_size, + const int block_size, const int max_num_blocks_per_seq, + const int total_kv_tokens, cudaStream_t stream, + const int max_threads_per_block) { + const int64_t total_elems = static_cast(total_kv_tokens) * num_heads * head_size; + if (total_elems == 0) { + return Status::OK(); + } + // With the op's batch_size <= 256 precondition (paged_attention.cc) and MEA's + // head_size <= 1024 cap, blocks_needed = ceil(total_elems / threads) stays comfortably + // within int range for any realistic input, so no explicit clamp is needed. The kernel + // uses a grid-stride loop so launching fewer blocks than total_elems / threads would + // also be correct — we don't need an artificial "keep SMs busy" cap. + const int threads = static_cast(std::min(max_threads_per_block, total_elems)); + const int blocks = static_cast((total_elems + threads - 1) / threads); + GatherAndExpandPagedKVCache<<>>( + key_cache, value_cache, gathered_key, gathered_value, + block_table, cumulative_seqlens_kv, + batch_size, num_heads, kv_num_heads, head_size, + block_size, max_num_blocks_per_seq, total_elems); + return CUDA_CALL(cudaGetLastError()); +} + ////////// Launch Kernels #if USE_FLASH_ATTENTION @@ -276,12 +372,11 @@ Status FlashAttention( value = reinterpret_cast(key) + static_cast(kv_num_heads * head_size); } - // Calculate cumulative present sequence length in cumulative_seqlens_kv + // cumulative_seqlens_kv is populated by the caller (paged_attention.cc) before QkvToContext; + // shared across FA and MEA dispatch paths so the host can also read total_kv_tokens. int* cumulative_seqlens_q = const_cast(data.cumulative_seqlens_q); int* past_seqlens = const_cast(data.past_seqlens); int* cumulative_seqlens_kv = data.cumulative_seqlens_kv; - ORT_RETURN_IF_ERROR(LaunchGetCumulativeSeqlensKV(cumulative_seqlens_kv, cumulative_seqlens_q, past_seqlens, - batch_size, stream)); if (parameters.do_rotary) { // Will unpack Q and K in case of packed_qkv @@ -335,6 +430,127 @@ Status FlashAttention( } #endif +#if USE_MEMORY_EFFICIENT_ATTENTION +// Fallback when FlashAttention is unavailable (SM<80 or ORT_DISABLE_FLASH_ATTENTION=1). 
+// Mirrors the FlashAttention preprocessing (rotary, unpack, ReshapeAndCache), then gathers +// the paged KV cache into a packed-varlen [total_kv_tokens, num_heads, head_size] buffer and +// dispatches to CUTLASS memory-efficient attention via its seqstart_q / seqstart_k varlen ABI. +// Caller must populate data.gathered_key / data.gathered_value / data.total_kv_tokens. +template +Status EfficientAttention( + const cudaDeviceProp& device_prop, + cudaStream_t stream, + contrib::PagedAttentionParameters& parameters, + PagedAttentionData& data, + float scale) { + const int max_threads_per_block = device_prop.maxThreadsPerBlock; + const int batch_size = parameters.batch_size; + const int token_count = parameters.token_count; + const int q_hidden_size = parameters.hidden_size; + const int kv_hidden_size = parameters.kv_hidden_size; + const int num_heads = parameters.num_heads; + const int kv_num_heads = parameters.kv_num_heads; + const int head_size = parameters.head_size; + const int block_size = parameters.block_size; + const int max_num_blocks_per_seq = parameters.max_num_blocks_per_seq; + const int local_window_size = parameters.local_window_size; + const int total_kv_tokens = data.total_kv_tokens; + // Use the caller-computed actual max of per-batch new-query lengths, not the + // `token_count - batch_size + 1` heuristic: the heuristic assumes >=1 new token per batch + // and underestimates otherwise, which would silently drop query tokens from the + // rotary grid and from MEA's `grid_x = ceil_div(sequence_length, kQueriesPerBlock)`. + const int max_query_len = data.max_query_len; + + T* query = const_cast(data.query); + T* key; + T* value; + if (!parameters.is_packed_qkv) { + key = const_cast(data.key); + value = const_cast(data.value); + } else { + key = reinterpret_cast(query) + static_cast(num_heads * head_size); + value = reinterpret_cast(key) + static_cast(kv_num_heads * head_size); + } + + // cumulative_seqlens_kv is populated by the caller (paged_attention.cc) before QkvToContext; + // shared across FA and MEA dispatch paths. + int* cumulative_seqlens_q = const_cast(data.cumulative_seqlens_q); + int* past_seqlens = const_cast(data.past_seqlens); + int* cumulative_seqlens_kv = data.cumulative_seqlens_kv; + + if (parameters.do_rotary) { + auto q_buffer = data.workspace_buffer; + auto k_buffer = data.workspace_buffer + token_count * num_heads * head_size; + const int packed_seq_stride = parameters.is_packed_qkv ? 
(num_heads + 2 * kv_num_heads) * head_size : -1; + ORT_RETURN_IF_ERROR(LaunchRotaryEmbeddingKernel( + stream, q_buffer, query, past_seqlens, cumulative_seqlens_q, data.cos_cache, data.sin_cache, batch_size, + max_query_len, num_heads, head_size, parameters.rotary_dim, parameters.rotary_interleaved, packed_seq_stride, + max_threads_per_block)); + ORT_RETURN_IF_ERROR(LaunchRotaryEmbeddingKernel( + stream, k_buffer, key, past_seqlens, cumulative_seqlens_q, data.cos_cache, data.sin_cache, batch_size, + max_query_len, kv_num_heads, head_size, parameters.rotary_dim, parameters.rotary_interleaved, packed_seq_stride, + max_threads_per_block)); + query = q_buffer; + key = k_buffer; + } else if (parameters.is_packed_qkv) { + auto q_buffer = data.workspace_buffer; + const int packed_seq_stride = q_hidden_size + 2 * kv_hidden_size; + ORT_RETURN_IF_ERROR(LaunchUnpackCumulative( + query, q_buffer, token_count, q_hidden_size, packed_seq_stride, stream, max_threads_per_block)); + query = q_buffer; + } + + int* block_table = const_cast(data.block_table); + const int key_stride = parameters.is_packed_qkv && !parameters.do_rotary ? q_hidden_size + 2 * kv_hidden_size : kv_hidden_size; + const int value_stride = parameters.is_packed_qkv ? q_hidden_size + 2 * kv_hidden_size : kv_hidden_size; + ORT_RETURN_IF_ERROR(LaunchReshapeAndCache(key, value, data.key_cache, data.value_cache, block_table, past_seqlens, + cumulative_seqlens_q, batch_size, max_num_blocks_per_seq, token_count, + kv_hidden_size, block_size, key_stride, value_stride, stream, + max_threads_per_block)); + + ORT_RETURN_IF_ERROR(LaunchGatherAndExpandPagedKVCache( + data.key_cache, data.value_cache, data.gathered_key, data.gathered_value, + block_table, cumulative_seqlens_kv, batch_size, num_heads, kv_num_heads, + head_size, block_size, max_num_blocks_per_seq, total_kv_tokens, stream, max_threads_per_block)); + + MemoryEfficientAttentionParams p; + p.sm = device_prop.major * 10 + device_prop.minor; + p.is_bf16 = std::is_same::value; + p.is_half = !p.is_bf16 && (sizeof(T) == 2); + p.batch_size = batch_size; + p.num_heads = num_heads; + p.sequence_length = max_query_len; + p.kv_sequence_length = total_kv_tokens; + p.max_sequence_length = total_kv_tokens; + p.qk_head_size = head_size; + p.v_head_size = head_size; + p.causal = true; + p.scale = scale; + p.softcap = parameters.softcap; + p.local_window_size = local_window_size; + p.seqstart_q_ptr = cumulative_seqlens_q; + p.seqstart_k_ptr = cumulative_seqlens_kv; + p.seqlen_k_ptr = nullptr; + p.query = query; + p.key = data.gathered_key; + p.value = data.gathered_value; + p.attn_bias = nullptr; + p.is_kv_bsnh = true; + p.has_custom_right_padding = false; + p.output = data.output; + p.workspace = MemoryEfficientAttentionParams::need_workspace(head_size, sizeof(T) == sizeof(float)) + ? 
data.fmha_buffer + : nullptr; + p.stream = stream; + run_memory_efficient_attention(p); + + DUMP_TENSOR_INIT(); + DUMP_TENSOR("mea paged attention output", data.output, token_count, num_heads, head_size); + + return Status::OK(); +} +#endif + ////////// API Functions template @@ -353,7 +569,13 @@ Status QkvToContext( } #endif - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unfused Paged Attention not implemented."); +#if USE_MEMORY_EFFICIENT_ATTENTION + if (data.use_memory_efficient_attention) { + return EfficientAttention(device_prop, stream, parameters, data, scale); + } +#endif + + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "No PagedAttention kernel available for the current configuration."); } template struct PagedAttentionData; diff --git a/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.h index 7e27556a5c63f..22f9793be0af6 100644 --- a/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.h @@ -27,6 +27,11 @@ Status LaunchUnpackQKVCumulative(const T* packed_qkv, T* unpacked_q, T* unpacked const int kv_num_heads, const int head_size, const int token_count, cudaStream_t stream, const int max_threads_per_block); +// Exposed so paged_attention.cc can populate cumulative_seqlens_kv on both the FA and MEA +// dispatch paths (producer hoisted out of FlashAttention/UnfusedAttention in impl.cu). +Status LaunchGetCumulativeSeqlensKV(int32_t* cumulative_seqlens_kv, const int32_t* cumulative_seqlens_q, + const int32_t* past_seqlens, const int batch_size, cudaStream_t stream); + } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc index 9f81e490971cd..69d2db391ce3c 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc @@ -35,13 +35,28 @@ Status RotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) const { " if (global_idx >= size) { return; }\n" " if (bsnh[3] < half_rotary_emb_dim) {\n" << " let position_ids_idx = " << position_ids.BroadcastedIndicesToOffset("bsnh.xy", output_indices) << ";\n" - << " let position_id = u32(" << position_ids.GetByOffset("position_ids_idx") << ") + select(0, bsnh[1], position_ids_idx == 0);\n" + << " let raw_pos = " << position_ids.GetByOffset("position_ids_idx") << ";\n" << " let i = dot(bsnh, uniforms.input_output_stride) + select(0, bsnh[3], " << interleaved_str << ");\n" << " let j = i + select(half_rotary_emb_dim, 1, " << interleaved_str << ");\n" - << " let re = " << input.GetByOffset("i") << " * " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << " - " << input.GetByOffset("j") << " * " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" - << " " << output.SetByOffset("i", "re") << "\n" - << " let im = " << input.GetByOffset("i") << " * " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << " + " << input.GetByOffset("j") << " * " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" - << " " << output.SetByOffset("j", "im") << "\n" + " let max_position = uniforms.cos_cache_shape[0];\n" + // Bounds check: raw_pos < 0 catches negative position_ids (i32 from truncated int64). + // After u32 conversion + offset, check >= max_position catches too-large values. + // On OOB, pass through input unchanged (same as CUDA kernel behavior). 
+ " if (raw_pos < 0) {\n" + << " " << output.SetByOffset("i", input.GetByOffset("i")) << "\n" + << " " << output.SetByOffset("j", input.GetByOffset("j")) << "\n" + " } else {\n" + " let position_id = u32(raw_pos) + select(0, bsnh[1], position_ids_idx == 0);\n" + " if (position_id >= max_position) {\n" + << " " << output.SetByOffset("i", input.GetByOffset("i")) << "\n" + << " " << output.SetByOffset("j", input.GetByOffset("j")) << "\n" + " } else {\n" + << " let re = " << input.GetByOffset("i") << " * " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << " - " << input.GetByOffset("j") << " * " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" + << " " << output.SetByOffset("i", "re") << "\n" + << " let im = " << input.GetByOffset("i") << " * " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << " + " << input.GetByOffset("j") << " * " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" + << " " << output.SetByOffset("j", "im") << "\n" + " }\n" + " }\n" << " } else { \n" " let k = dot(bsnh, uniforms.input_output_stride) + half_rotary_emb_dim;\n" << " " << output.SetByOffset("k", input.GetByOffset("k")) << "\n" @@ -74,24 +89,39 @@ Status FusedQKRotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) c << " let seqlen = u32(seqlen_i);\n" << " let total_seqlen = seqlen + 1u;\n" << " let past_seqlen = total_seqlen - uniforms.q_global_shape[1];\n" + // position_id is derived from past_seqlen + sequence_idx (always non-negative). << " let position_id = past_seqlen + sequence_idx;\n" - << " let cos_v = " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" - << " let sin_v = " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" << " let qi = dot(bsnh, uniforms.q_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n" << " let qj = qi + select(half_rotary_dim, 1u, " << interleaved_str << ");\n" - << " let q_re = " << q_input.GetByOffset("qi") << " * cos_v - " << q_input.GetByOffset("qj") << " * sin_v;\n" - << " " << q_output.SetByOffset("qi", "q_re") << "\n" - << " let q_im = " << q_input.GetByOffset("qi") << " * sin_v + " << q_input.GetByOffset("qj") << " * cos_v;\n" - << " " << q_output.SetByOffset("qj", "q_im") << "\n" + // Bounds check: position_id must be within cos/sin cache range. + // On OOB, pass through input unchanged (same as CUDA kernel behavior). 
+ " let max_position = uniforms.cos_cache_shape[0];\n" + " if (position_id >= max_position) {\n" + << " " << q_output.SetByOffset("qi", q_input.GetByOffset("qi")) << "\n" + << " " << q_output.SetByOffset("qj", q_input.GetByOffset("qj")) << "\n" + << " if (bsnh[2] < uniforms.k_global_shape[2]) {\n" + << " let ki = dot(bsnh, uniforms.k_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n" + << " let kj = ki + select(half_rotary_dim, 1u, " << interleaved_str << ");\n" + << " " << k_output.SetByOffset("ki", k_input.GetByOffset("ki")) << "\n" + << " " << k_output.SetByOffset("kj", k_input.GetByOffset("kj")) << "\n" + " }\n" + " } else {\n" + << " let cos_v = " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" + << " let sin_v = " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" + << " let q_re = " << q_input.GetByOffset("qi") << " * cos_v - " << q_input.GetByOffset("qj") << " * sin_v;\n" + << " " << q_output.SetByOffset("qi", "q_re") << "\n" + << " let q_im = " << q_input.GetByOffset("qi") << " * sin_v + " << q_input.GetByOffset("qj") << " * cos_v;\n" + << " " << q_output.SetByOffset("qj", "q_im") << "\n" // Conditionally process Key (only for heads that exist in K domain) - << " if (bsnh[2] < uniforms.k_global_shape[2]) {\n" - << " let ki = dot(bsnh, uniforms.k_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n" - << " let kj = ki + select(half_rotary_dim, 1u, " << interleaved_str << ");\n" - << " let k_re = " << k_input.GetByOffset("ki") << " * cos_v - " << k_input.GetByOffset("kj") << " * sin_v;\n" - << " " << k_output.SetByOffset("ki", "k_re") << "\n" - << " let k_im = " << k_input.GetByOffset("ki") << " * sin_v + " << k_input.GetByOffset("kj") << " * cos_v;\n" - << " " << k_output.SetByOffset("kj", "k_im") << "\n" - << " }\n" + << " if (bsnh[2] < uniforms.k_global_shape[2]) {\n" + << " let ki = dot(bsnh, uniforms.k_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n" + << " let kj = ki + select(half_rotary_dim, 1u, " << interleaved_str << ");\n" + << " let k_re = " << k_input.GetByOffset("ki") << " * cos_v - " << k_input.GetByOffset("kj") << " * sin_v;\n" + << " " << k_output.SetByOffset("ki", "k_re") << "\n" + << " let k_im = " << k_input.GetByOffset("ki") << " * sin_v + " << k_input.GetByOffset("kj") << " * cos_v;\n" + << " " << k_output.SetByOffset("kj", "k_im") << "\n" + " }\n" + " }\n" << " } else {\n" << " let qk = dot(bsnh, uniforms.q_input_output_stride) + half_rotary_dim;\n" << " " << q_output.SetByOffset("qk", q_input.GetByOffset("qk")) << "\n" @@ -127,6 +157,11 @@ Status RotaryEmbedding::ComputeInternal(onnxruntime::webgpu::ComputeContext& con const auto half_rotary_embedding_dim = onnxruntime::narrow(cos_cache->Shape()[1]); const auto head_size = rotary_embedding_dim_ == 0 ? half_rotary_embedding_dim * 2 : hidden_size / num_heads_; + // position_ids bounds validation is handled by shader-side defense-in-depth checks + // (OOB position_ids → pass-through input unchanged). Host-side value scanning is not possible + // because WebGPU program inputs must be GPU buffers (InputMemoryType(OrtMemTypeCPUInput) is + // incompatible with AddInputs). + // Rotary embeddings will be calculated in a pair-wise fashion. In accordance, use the shape // [batch size, sequence length, num of heads, num of pairs to rotate + num of dims to copy] // to unfold the global index in shader. 
diff --git a/onnxruntime/core/framework/sparse_utils.cc b/onnxruntime/core/framework/sparse_utils.cc index c42f6d190512c..48b28db9f9028 100644 --- a/onnxruntime/core/framework/sparse_utils.cc +++ b/onnxruntime/core/framework/sparse_utils.cc @@ -7,6 +7,7 @@ #include "core/common/span_utils.h" #include "core/common/status.h" +#include "core/common/safeint.h" #include "core/framework/tensor.h" #include "core/framework/data_types_internal.h" #include "core/framework/data_transfer_manager.h" @@ -256,16 +257,36 @@ Status SparseCsrToDenseTensor(const DataTransferManager& data_manager, const Spa } void* output = cpu_result.MutableDataRaw(); + const auto dense_size = cpu_result.Shape().Size(); + const auto outer_size = outer_span.size(); + const auto inner_size = static_cast(inner_span.size()); + + // Validate CSR structural invariants (O(1) checks). + if (outer_size > 0) { + ORT_RETURN_IF_NOT(outer_span[0] == 0, + "CSR outer index must start at 0, got: ", outer_span[0]); + ORT_RETURN_IF_NOT(outer_span[outer_size - 1] == inner_size, + "CSR outer index last element must equal inner index count (", + inner_size, "), got: ", outer_span[outer_size - 1]); + } - size_t src_idx = 0; size_t inner_idx = 0; - for (size_t out_i = 1; out_i < outer_span.size(); ++out_i) { + for (size_t out_i = 1; out_i < outer_size; ++out_i) { + ORT_RETURN_IF_NOT(outer_span[out_i] >= outer_span[out_i - 1], + "CSR outer index not non-decreasing at position ", out_i, + ": ", outer_span[out_i]); auto row_size = outer_span[out_i] - outer_span[out_i - 1]; for (int64_t cnt = 0; cnt < row_size; ++cnt, ++inner_idx) { - assert(inner_idx < inner_span.size()); + ORT_RETURN_IF_NOT(inner_idx < inner_span.size(), + "CSR inner index out of range: inner_idx=", inner_idx, + " >= inner_span.size()=", inner_span.size()); auto col = inner_span[inner_idx]; - auto dst_idx = (out_i - 1) * cols + col; - copy_func(output, values, dst_idx, src_idx); + ORT_RETURN_IF_NOT(col >= 0 && col < cols, "Invalid CSR column index: ", col); + // Use SafeInt to prevent overflow during index calculation. + int64_t dst_idx = SafeInt(out_i - 1) * cols + col; + ORT_RETURN_IF_NOT(dst_idx >= 0 && dst_idx < dense_size, + "Invalid CSR computed index: ", dst_idx); + copy_func(output, values, dst_idx, inner_idx); } } } @@ -356,15 +377,22 @@ Status SparseCooToDenseTensor(const DataTransferManager& data_manager, const Spa if (num_indices == num_values) { for (int64_t src_idx = 0; src_idx < num_values; ++src_idx) { auto dst_idx = indices[src_idx]; - ORT_RETURN_IF_NOT(dst_idx < dense_size, "Invalid index: ", dst_idx, " > dense_size: ", dense_size); + ORT_RETURN_IF_NOT(dst_idx >= 0 && dst_idx < dense_size, + "Invalid COO index: ", dst_idx); copy_func(output, values, dst_idx, src_idx); } } else { + const auto rows = src_dims[0]; const auto cols = src_dims[1]; for (int64_t src_idx = 0; src_idx < num_values; ++src_idx) { auto tuple_idx = src_idx * 2; - auto dst_idx = indices[tuple_idx] * cols + indices[tuple_idx + 1]; - ORT_RETURN_IF_NOT(dst_idx < dense_size, "Invalid index: ", dst_idx, " > dense_size: ", dense_size); + auto r = indices[tuple_idx]; + auto c = indices[tuple_idx + 1]; + ORT_RETURN_IF_NOT(r >= 0 && r < rows && c >= 0 && c < cols, + "Invalid COO 2D index: (", r, ", ", c, + ") must be in [0, ", rows, ") x [0, ", cols, ")"); + // Use SafeInt to prevent overflow during index calculation. 
+ int64_t dst_idx = SafeInt(r) * cols + c; copy_func(output, values, dst_idx, src_idx); } } diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 59bde3742b288..3e928afcf6c80 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -2074,7 +2074,7 @@ static Status CopySparseData(const std::string& name, switch (indices.data_type()) { case ONNX_NAMESPACE::TensorProto_DataType_INT64: if (needs_unpack) { - ORT_RETURN_IF_NOT(indices.raw_data().size() == (narrow(indices_elements) * sizeof(int64_t)), + ORT_RETURN_IF_NOT(indices.raw_data().size() == SafeInt(indices_elements) * sizeof(int64_t), "Sparse tensor: ", name, " indices raw data size does not match expected: ", indices_elements * sizeof(int64_t)); ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer)); @@ -2088,7 +2088,7 @@ static Status CopySparseData(const std::string& name, break; case ONNX_NAMESPACE::TensorProto_DataType_INT32: { if (needs_unpack) { - ORT_RETURN_IF_NOT(indices.raw_data().size() == (narrow(indices_elements) * sizeof(int32_t)), + ORT_RETURN_IF_NOT(indices.raw_data().size() == SafeInt(indices_elements) * sizeof(int32_t), "Sparse tensor: ", name, " indices raw data size does not match expected: ", indices_elements * sizeof(int32_t)); ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer)); @@ -2107,7 +2107,7 @@ static Status CopySparseData(const std::string& name, } case ONNX_NAMESPACE::TensorProto_DataType_INT16: { if (needs_unpack) { - ORT_RETURN_IF_NOT(indices.raw_data().size() == (narrow(indices_elements) * sizeof(int16_t)), + ORT_RETURN_IF_NOT(indices.raw_data().size() == SafeInt(indices_elements) * sizeof(int16_t), "Sparse tensor: ", name, " indices raw data size does not match expected: ", indices_elements * sizeof(int16_t)); ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer)); @@ -2288,14 +2288,14 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT // by putting the data into a std::string we can avoid a copy as set_raw_data can do a std::move // into the TensorProto. 
- std::string dense_data_storage(narrow(dense_elements) * element_size, 0); + std::string dense_data_storage(SafeInt(dense_elements) * element_size, 0); if (nnz_elements > 0) { // need to read in sparse data first as it could be in a type specific field, in raw data, or in external data std::vector values_data; ORT_RETURN_IF_ERROR(UnpackInitializerData(sparse_values, model_path, values_data)); - ORT_RETURN_IF_NOT(values_data.size() == static_cast(nnz_elements) * element_size, + ORT_RETURN_IF_NOT(values_data.size() == SafeInt(nnz_elements) * element_size, "Sparse tensor: ", name, " values data size does not match expected: ", - static_cast(nnz_elements) * element_size); + static_cast(SafeInt(nnz_elements) * element_size)); void* sparse_data = values_data.data(); void* dense_data = dense_data_storage.data(); diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index f7c2908d0ab8b..04e99d206bd06 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -60,6 +60,9 @@ Module Name: #if defined(__s390x__) #define MLAS_TARGET_S390X #endif +#if defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64) +#define MLAS_TARGET_RISCV64 +#endif #if defined(__VSX__) #define MLAS_TARGET_POWER diff --git a/onnxruntime/core/mlas/lib/compute.cpp b/onnxruntime/core/mlas/lib/compute.cpp index 4916062f2b4f9..a677ee5087672 100644 --- a/onnxruntime/core/mlas/lib/compute.cpp +++ b/onnxruntime/core/mlas/lib/compute.cpp @@ -876,7 +876,7 @@ Return Value: // float Maximum; -#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64) Maximum = GetMlasPlatform().ReduceMaximumF32Kernel(Input, D); #else Maximum = MlasReduceMaximumF32Kernel(Input, D); @@ -894,7 +894,7 @@ Return Value: float* Temp = LogSoftmax ? 
nullptr : Output; float Accumulation; -#if defined(MLAS_TARGET_AMD64) || defined(MLAS_USE_SVE) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64) Accumulation = GetMlasPlatform().ComputeSumExpF32Kernel(Input, Temp, D, &NegativeMaximum); #else Accumulation = MlasComputeSumExpF32Kernel(Input, Temp, D, &NegativeMaximum); @@ -910,7 +910,7 @@ Return Value: // float Parameters[] = {NegativeMaximum, std::log(Accumulation)}; -#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64) GetMlasPlatform().ComputeLogSoftmaxOutputF32Kernel(Input, Output, D, Parameters); #else @@ -922,7 +922,7 @@ Return Value: // float Parameters[] = {1.0f / Accumulation}; -#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64) GetMlasPlatform().ComputeSoftmaxOutputF32Kernel(Output, D, Parameters); #else MlasComputeSoftmaxOutputF32Kernel(Output, D, Parameters); diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index 662e757a47998..1fa4c90913b24 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -352,7 +352,8 @@ static_assert(sizeof(MLAS_FP16) == FP16_SIZE); // #if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || \ - defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_S390X) + defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_S390X) || \ + defined(MLAS_TARGET_RISCV64) typedef size_t @@ -1018,6 +1019,36 @@ extern "C" { MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelLasx; MLAS_COMPUTE_SOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeSoftmaxOutputF32KernelLasx; MLAS_COMPUTE_LOGSOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeLogSoftmaxOutputF32KernelLasx; +#elif defined(MLAS_TARGET_RISCV64) +#if defined(MLAS_USE_RVV) + MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelRvv; + void MlasSgemmCopyPackBRvv( + float* D, + const float* B, + size_t ldb, + size_t CountX, + size_t CountY); +#endif + size_t MLASCALL MlasSgemmKernelZero( + const float* A, + const float* B, + float* C, + size_t CountK, + size_t CountM, + size_t CountN, + size_t lda, + size_t ldc, + float alpha); + size_t MLASCALL MlasSgemmKernelAdd( + const float* A, + const float* B, + float* C, + size_t CountK, + size_t CountM, + size_t CountN, + size_t lda, + size_t ldc, + float alpha); #else MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelZero; MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelAdd; @@ -1167,6 +1198,12 @@ extern "C" { MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32Kernel; MLAS_REDUCE_MINIMUM_MAXIMUM_FLOAT_KERNEL MlasReduceMinimumMaximumF32Kernel; +#if defined(MLAS_TARGET_RISCV64) && defined(MLAS_USE_RVV) + MLAS_COMPUTE_SUMEXP_FLOAT_KERNEL MlasComputeSumExpF32KernelRvv; + MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelRvv; + MLAS_COMPUTE_SOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeSoftmaxOutputF32KernelRvv; + MLAS_COMPUTE_LOGSOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeLogSoftmaxOutputF32KernelRvv; +#endif #if defined(MLAS_TARGET_AMD64) MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelAvx; MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelAvx512F; @@ -1442,7 +1479,7 @@ struct MLAS_PLATFORM { #endif -#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_S390X) +#if defined(MLAS_TARGET_AMD64_IX86) || 
defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_S390X) || defined(MLAS_TARGET_RISCV64) MLAS_GEMM_FLOAT_KERNEL* GemmFloatKernel; #endif #if defined(MLAS_TARGET_LARCH64) @@ -1507,7 +1544,7 @@ struct MLAS_PLATFORM { MLAS_QUANTIZE_LINEAR_U4_KERNEL* QuantizeLinearU4Kernel; #endif -#if defined(MLAS_USE_SVE) || defined(MLAS_TARGET_AMD64) +#if defined(MLAS_USE_SVE) || defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_RISCV64) MLAS_COMPUTE_UNARY_FLOAT_KERNEL* ErfKernelRoutine; MLAS_COMPUTE_UNARY_FLOAT_KERNEL* LogisticKernelRoutine; MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL* ReduceMaximumF32Kernel; diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index e9f140a2ee0f7..191ee1ab2f2f8 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -27,8 +27,10 @@ Module Name: #include "kleidiai/mlasi_kleidiai.h" #endif -#include +#include +#include #include +#include #if defined(MLAS_TARGET_POWER) #if defined(__linux__) @@ -49,6 +51,54 @@ Module Name: #include #endif +#if defined(MLAS_TARGET_RISCV64) && defined(MLAS_USE_RVV) && defined(__linux__) +#include +#include +#ifndef COMPAT_HWCAP_ISA_V +#define COMPAT_HWCAP_ISA_V (1UL << ('V' - 'A')) +#endif +#endif + +#if defined(MLAS_TARGET_RISCV64) && defined(MLAS_USE_RVV) +namespace { + +bool +MlasStringEqualsIgnoreCase( + const char* value, + const char* expected + ) +{ + while (*value != '\0' && *expected != '\0') { + const auto lhs = static_cast(*value); + const auto rhs = static_cast(*expected); + if (std::tolower(lhs) != std::tolower(rhs)) { + return false; + } + ++value; + ++expected; + } + + return *value == '\0' && *expected == '\0'; +} + +bool +MlasShouldForceScalarRiscv( + const char* value + ) +{ + if (value == nullptr || value[0] == '\0') { + return false; + } + + return MlasStringEqualsIgnoreCase(value, "1") || + MlasStringEqualsIgnoreCase(value, "true") || + MlasStringEqualsIgnoreCase(value, "on") || + MlasStringEqualsIgnoreCase(value, "yes"); +} + +} // namespace +#endif + #if defined(MLAS_TARGET_ARM64) #if defined(_WIN32) @@ -265,6 +315,33 @@ Return Value: this->CastF16ToF32Kernel = nullptr; this->CastF32ToF16Kernel = nullptr; +#if defined(MLAS_TARGET_RISCV64) + this->GemmFloatKernel = nullptr; + this->ErfKernelRoutine = MlasErfKernel; + this->LogisticKernelRoutine = MlasLogisticKernel; + this->ReduceMaximumF32Kernel = MlasReduceMaximumF32Kernel; + this->ComputeSumExpF32Kernel = MlasComputeSumExpF32Kernel; + this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32Kernel; + this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32Kernel; + +#if defined(MLAS_USE_RVV) + bool has_rvv = true; +#if defined(__linux__) + has_rvv = (getauxval(AT_HWCAP) & COMPAT_HWCAP_ISA_V) != 0; +#endif + if (MlasShouldForceScalarRiscv(std::getenv("ORT_MLAS_RISCV_FORCE_SCALAR"))) { + has_rvv = false; + } + if (has_rvv) { + this->GemmFloatKernel = MlasGemmFloatKernelRvv; + this->ReduceMaximumF32Kernel = MlasReduceMaximumF32KernelRvv; + this->ComputeSumExpF32Kernel = MlasComputeSumExpF32KernelRvv; + this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32KernelRvv; + this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32KernelRvv; + } +#endif +#endif + #if defined(MLAS_TARGET_AMD64_IX86) // diff --git a/onnxruntime/core/mlas/lib/riscv64/sgemm_kernel_rvv.cpp b/onnxruntime/core/mlas/lib/riscv64/sgemm_kernel_rvv.cpp new file mode 100644 index 0000000000000..c6e43e2c8bcd4 --- /dev/null +++ b/onnxruntime/core/mlas/lib/riscv64/sgemm_kernel_rvv.cpp @@ -0,0 
+1,275 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + sgemm_kernel_rvv.cpp + +Abstract: + + This module implements an RVV kernel for the single precision matrix/matrix + multiply operation (SGEMM) on riscv64. + +--*/ + +#include "mlasi.h" + +#if defined(MLAS_USE_RVV) + +#include + +namespace { + +// The packed B layout stays 16 columns wide to match MLAS, but each tile is +// consumed in runtime-sized RVV chunks so the kernel is not tied to a fixed +// VLEN such as 128 or 256 bits. +constexpr size_t kPackedCountN = 16; + +template +MLAS_FORCEINLINE +void +MlasStoreAccumulatorRvv( + float* C, + vfloat32m4_t Accumulator, + size_t vl, + float alpha + ) +{ +#if defined(_WIN32) + + if constexpr (AlphaIsOne) { + UNREFERENCED_PARAMETER(alpha); + } + +#endif + + if constexpr (!AlphaIsOne) { + Accumulator = __riscv_vfmul_vf_f32m4(Accumulator, alpha, vl); + } + + if constexpr (!ZeroMode) { + Accumulator = __riscv_vfadd_vv_f32m4(Accumulator, __riscv_vle32_v_f32m4(C, vl), vl); + } + + __riscv_vse32_v_f32m4(C, Accumulator, vl); +} + +template +MLAS_FORCEINLINE +size_t +MlasSgemmKernelRvv( + const float* A, + const float* B, + float* C, + size_t CountK, + size_t CountN, + size_t lda, + size_t ldc, + float alpha + ) +{ + static_assert(Rows >= 1 && Rows <= 4, "unsupported RVV SGEMM tile height"); + +#if defined(_WIN32) + + if constexpr (Rows == 1) { + UNREFERENCED_PARAMETER(lda); + UNREFERENCED_PARAMETER(ldc); + } + + if constexpr (AlphaIsOne) { + UNREFERENCED_PARAMETER(alpha); + } + +#endif + + const float* packed_b_block = B; + float* c_block = C; + size_t remaining_n_total = CountN; + + do { + const size_t count_n_block = remaining_n_total >= kPackedCountN ? kPackedCountN : remaining_n_total; + size_t remaining_n_block = count_n_block; + size_t column_offset = 0; + float* c = c_block; + + while (remaining_n_block > 0) { + // Split a packed 16-column tile into however many lanes the current + // machine exposes for e32,m4. This keeps the kernel VLEN-agnostic. 
+ const size_t vl = __riscv_vsetvl_e32m4(remaining_n_block); + vfloat32m4_t row0_block = __riscv_vfmv_v_f_f32m4(0.0f, vl); + vfloat32m4_t row1_block; + vfloat32m4_t row2_block; + vfloat32m4_t row3_block; + + if constexpr (Rows >= 2) { + row1_block = __riscv_vfmv_v_f_f32m4(0.0f, vl); + } + if constexpr (Rows >= 3) { + row2_block = __riscv_vfmv_v_f_f32m4(0.0f, vl); + } + if constexpr (Rows >= 4) { + row3_block = __riscv_vfmv_v_f_f32m4(0.0f, vl); + } + + const float* a = A; + const float* b = packed_b_block + column_offset; + size_t k = CountK; + + while (k >= 2) { + const float row0_a0 = a[0]; + const float row0_a1 = a[1]; + vfloat32m4_t b_elements = __riscv_vle32_v_f32m4(b, vl); + row0_block = __riscv_vfmacc_vf_f32m4(row0_block, row0_a0, b_elements, vl); + + if constexpr (Rows >= 2) { + row1_block = __riscv_vfmacc_vf_f32m4(row1_block, a[lda], b_elements, vl); + } + if constexpr (Rows >= 3) { + row2_block = __riscv_vfmacc_vf_f32m4(row2_block, a[lda * 2], b_elements, vl); + } + if constexpr (Rows >= 4) { + row3_block = __riscv_vfmacc_vf_f32m4(row3_block, a[lda * 3], b_elements, vl); + } + + b_elements = __riscv_vle32_v_f32m4(b + kPackedCountN, vl); + row0_block = __riscv_vfmacc_vf_f32m4(row0_block, row0_a1, b_elements, vl); + + if constexpr (Rows >= 2) { + row1_block = __riscv_vfmacc_vf_f32m4(row1_block, a[lda + 1], b_elements, vl); + } + if constexpr (Rows >= 3) { + row2_block = __riscv_vfmacc_vf_f32m4(row2_block, a[lda * 2 + 1], b_elements, vl); + } + if constexpr (Rows >= 4) { + row3_block = __riscv_vfmacc_vf_f32m4(row3_block, a[lda * 3 + 1], b_elements, vl); + } + + a += 2; + b += kPackedCountN * 2; + k -= 2; + } + + if (k > 0) { + vfloat32m4_t b_elements = __riscv_vle32_v_f32m4(b, vl); + row0_block = __riscv_vfmacc_vf_f32m4(row0_block, a[0], b_elements, vl); + + if constexpr (Rows >= 2) { + row1_block = __riscv_vfmacc_vf_f32m4(row1_block, a[lda], b_elements, vl); + } + if constexpr (Rows >= 3) { + row2_block = __riscv_vfmacc_vf_f32m4(row2_block, a[lda * 2], b_elements, vl); + } + if constexpr (Rows >= 4) { + row3_block = __riscv_vfmacc_vf_f32m4(row3_block, a[lda * 3], b_elements, vl); + } + } + + MlasStoreAccumulatorRvv(c, row0_block, vl, alpha); + + if constexpr (Rows >= 2) { + MlasStoreAccumulatorRvv(c + ldc, row1_block, vl, alpha); + } + if constexpr (Rows >= 3) { + MlasStoreAccumulatorRvv(c + ldc * 2, row2_block, vl, alpha); + } + if constexpr (Rows >= 4) { + MlasStoreAccumulatorRvv(c + ldc * 3, row3_block, vl, alpha); + } + + c += vl; + column_offset += vl; + remaining_n_block -= vl; + } + + c_block += count_n_block; + packed_b_block += CountK * kPackedCountN; + remaining_n_total -= count_n_block; + + } while (remaining_n_total > 0); + + return Rows; +} + +template +MLAS_FORCEINLINE +size_t +MlasGemmFloatKernelRvvDispatchRows( + const float* A, + const float* B, + float* C, + size_t CountK, + size_t CountM, + size_t CountN, + size_t lda, + size_t ldc, + float alpha + ) +{ + if (CountM >= 4) { + return MlasSgemmKernelRvv(A, B, C, CountK, CountN, lda, ldc, alpha); + } + + if (CountM == 3) { + return MlasSgemmKernelRvv(A, B, C, CountK, CountN, lda, ldc, alpha); + } + + if (CountM >= 2) { + return MlasSgemmKernelRvv(A, B, C, CountK, CountN, lda, ldc, alpha); + } + + return MlasSgemmKernelRvv(A, B, C, CountK, CountN, lda, ldc, alpha); +} + +template +MLAS_FORCEINLINE +size_t +MlasGemmFloatKernelRvvDispatch( + const float* A, + const float* B, + float* C, + size_t CountK, + size_t CountM, + size_t CountN, + size_t lda, + size_t ldc, + float alpha + ) +{ + if (alpha == 1.0f) { + return 
MlasGemmFloatKernelRvvDispatchRows( + A, B, C, CountK, CountM, CountN, lda, ldc, alpha); + } + + return MlasGemmFloatKernelRvvDispatchRows( + A, B, C, CountK, CountM, CountN, lda, ldc, alpha); +} + +} // namespace + +size_t +MLASCALL +MlasGemmFloatKernelRvv( + const float* A, + const float* B, + float* C, + size_t CountK, + size_t CountM, + size_t CountN, + size_t lda, + size_t ldc, + float alpha, + bool ZeroMode + ) +{ + if (ZeroMode) { + return MlasGemmFloatKernelRvvDispatch(A, B, C, CountK, CountM, CountN, lda, ldc, alpha); + } + + return MlasGemmFloatKernelRvvDispatch(A, B, C, CountK, CountM, CountN, lda, ldc, alpha); +} + +#endif // defined(MLAS_USE_RVV) diff --git a/onnxruntime/core/mlas/lib/riscv64/sgemm_pack_b_rvv.cpp b/onnxruntime/core/mlas/lib/riscv64/sgemm_pack_b_rvv.cpp new file mode 100644 index 0000000000000..b2ec24e3fbfdc --- /dev/null +++ b/onnxruntime/core/mlas/lib/riscv64/sgemm_pack_b_rvv.cpp @@ -0,0 +1,115 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + sgemm_pack_b_rvv.cpp + +Abstract: + + This module implements an RVV packing helper for the single precision + matrix/matrix multiply operation (SGEMM) on riscv64. + +--*/ + +#include "mlasi.h" + +#if defined(MLAS_USE_RVV) + +#include + +namespace { + +// Keep MLAS packing in 16-column tiles, but let RVV decide the actual chunk +// size at runtime via vsetvl so the same code works across different VLENs. +constexpr size_t kPackedCountN = 16; + +MLAS_FORCEINLINE +void +MlasStoreZeroPaddedBlock( + float* D, + const float* B, + size_t CountX + ) +{ + size_t remaining = kPackedCountN; + size_t offset = 0; + + while (remaining > 0) { + const size_t vl = __riscv_vsetvl_e32m4(remaining); + __riscv_vse32_v_f32m4(D + offset, __riscv_vfmv_v_f_f32m4(0.0f, vl), vl); + offset += vl; + remaining -= vl; + } + + remaining = CountX; + offset = 0; + + while (remaining > 0) { + const size_t vl = __riscv_vsetvl_e32m4(remaining); + __riscv_vse32_v_f32m4(D + offset, __riscv_vle32_v_f32m4(B + offset, vl), vl); + offset += vl; + remaining -= vl; + } +} + +MLAS_FORCEINLINE +void +MlasStoreFullBlock( + float* D, + const float* B + ) +{ + size_t remaining = kPackedCountN; + size_t offset = 0; + + while (remaining > 0) { + const size_t vl = __riscv_vsetvl_e32m4(remaining); + __riscv_vse32_v_f32m4(D + offset, __riscv_vle32_v_f32m4(B + offset, vl), vl); + offset += vl; + remaining -= vl; + } +} + +} // namespace + +void +MlasSgemmCopyPackBRvv( + float* D, + const float* B, + size_t ldb, + size_t CountX, + size_t CountY + ) +{ + while (CountX >= kPackedCountN) { + const float* b = B; + size_t y = CountY; + + do { + MlasStoreFullBlock(D, b); + D += kPackedCountN; + b += ldb; + y--; + } while (y > 0); + + B += kPackedCountN; + CountX -= kPackedCountN; + } + + if (CountX > 0) { + size_t y = CountY; + + do { + MlasStoreZeroPaddedBlock(D, B, CountX); + D += kPackedCountN; + B += ldb; + y--; + } while (y > 0); + } +} + +#endif // defined(MLAS_USE_RVV) diff --git a/onnxruntime/core/mlas/lib/riscv64/softmax_kernel_rvv.cpp b/onnxruntime/core/mlas/lib/riscv64/softmax_kernel_rvv.cpp new file mode 100644 index 0000000000000..dc548b56d676e --- /dev/null +++ b/onnxruntime/core/mlas/lib/riscv64/softmax_kernel_rvv.cpp @@ -0,0 +1,207 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + softmax_kernel_rvv.cpp + +Abstract: + + This module implements RVV kernels for the softmax critical path on + riscv64. 
The implementation keeps the scope intentionally small and + focuses on the float32 primitives used by Softmax and LogSoftmax: + reduction, sum-exp, normalization, and log-softmax output. + +--*/ + +#include "mlasi.h" + +#if defined(MLAS_USE_RVV) + +#include + +namespace { + +constexpr float kExpLowerRangeSumExp = -88.3762626647949f; +constexpr float kRoundingBias = MLAS_ROUNDING_BIAS_MAGIC; +constexpr float kLog2Reciprocal = 1.44269504088896341f; +constexpr float kLog2High = -6.93145752e-1f; +constexpr float kLog2Low = -1.42860677e-6f; +constexpr float kPoly0 = 0x1.694000p-10f; +constexpr float kPoly1 = 0x1.125edcp-7f; +constexpr float kPoly2 = 0x1.555b5ap-5f; +constexpr float kPoly3 = 0x1.555450p-3f; +constexpr float kPoly4 = 0x1.fffff6p-2f; +constexpr float kPoly56 = 0x1.000000p+0f; +constexpr int32_t kMaximumExponentBits = 0x3F800000; + +MLAS_FORCEINLINE +vfloat32m1_t +MlasComputeExpVectorRvv( + vfloat32m1_t value, + size_t vl + ) +{ + value = __riscv_vfmax_vf_f32m1(value, kExpLowerRangeSumExp, vl); + + vfloat32m1_t scaled = __riscv_vfmul_vf_f32m1(value, kLog2Reciprocal, vl); + vfloat32m1_t biased = __riscv_vfadd_vf_f32m1(scaled, kRoundingBias, vl); + vfloat32m1_t reduced_m = __riscv_vfsub_vf_f32m1(biased, kRoundingBias, vl); + vfloat32m1_t reduced = __riscv_vfadd_vv_f32m1( + __riscv_vfmul_vf_f32m1(reduced_m, kLog2High, vl), value, vl); + reduced = __riscv_vfadd_vv_f32m1( + __riscv_vfmul_vf_f32m1(reduced_m, kLog2Low, vl), reduced, vl); + + vfloat32m1_t poly = __riscv_vfmv_v_f_f32m1(kPoly0, vl); + poly = __riscv_vfadd_vf_f32m1( + __riscv_vfmul_vv_f32m1(poly, reduced, vl), kPoly1, vl); + poly = __riscv_vfadd_vf_f32m1( + __riscv_vfmul_vv_f32m1(poly, reduced, vl), kPoly2, vl); + poly = __riscv_vfadd_vf_f32m1( + __riscv_vfmul_vv_f32m1(poly, reduced, vl), kPoly3, vl); + poly = __riscv_vfadd_vf_f32m1( + __riscv_vfmul_vv_f32m1(poly, reduced, vl), kPoly4, vl); + poly = __riscv_vfadd_vf_f32m1( + __riscv_vfmul_vv_f32m1(poly, reduced, vl), kPoly56, vl); + poly = __riscv_vfadd_vf_f32m1( + __riscv_vfmul_vv_f32m1(poly, reduced, vl), kPoly56, vl); + + vint32m1_t exponent_bits = __riscv_vreinterpret_v_f32m1_i32m1(biased); + exponent_bits = __riscv_vsll_vx_i32m1(exponent_bits, 23, vl); + exponent_bits = __riscv_vadd_vx_i32m1(exponent_bits, kMaximumExponentBits, vl); + vfloat32m1_t scale = __riscv_vreinterpret_v_i32m1_f32m1(exponent_bits); + + return __riscv_vfmul_vv_f32m1(poly, scale, vl); +} + +MLAS_FORCEINLINE +float +MlasReduceSumRvv( + vfloat32m1_t value, + size_t vl + ) +{ + vfloat32m1_t accumulator = __riscv_vfmv_s_f_f32m1(0.0f, 1); + accumulator = __riscv_vfredusum_vs_f32m1_f32m1(value, accumulator, vl); + return __riscv_vfmv_f_s_f32m1_f32(accumulator); +} + +MLAS_FORCEINLINE +float +MlasReduceMaxRvv( + vfloat32m1_t value, + size_t vl + ) +{ + vfloat32m1_t accumulator = + __riscv_vfmv_s_f_f32m1(std::numeric_limits::lowest(), 1); + accumulator = __riscv_vfredmax_vs_f32m1_f32m1(value, accumulator, vl); + return __riscv_vfmv_f_s_f32m1_f32(accumulator); +} + +} // namespace + +float +MLASCALL +MlasReduceMaximumF32KernelRvv( + const float* Input, + size_t N + ) +{ + float maximum = std::numeric_limits::lowest(); + + while (N > 0) { + size_t vl = __riscv_vsetvl_e32m1(N); + vfloat32m1_t input = __riscv_vle32_v_f32m1(Input, vl); + input = __riscv_vfmax_vf_f32m1(input, maximum, vl); + maximum = MlasReduceMaxRvv(input, vl); + + Input += vl; + N -= vl; + } + + return maximum; +} + +float +MLASCALL +MlasComputeSumExpF32KernelRvv( + const float* Input, + float* Output, + size_t N, + const float* 
NegativeMaximum + ) +{ + const float negative_maximum = *NegativeMaximum; + float accumulation = 0.0f; + + while (N > 0) { + size_t vl = __riscv_vsetvl_e32m1(N); + vfloat32m1_t input = __riscv_vle32_v_f32m1(Input, vl); + vfloat32m1_t shifted = __riscv_vfadd_vf_f32m1(input, negative_maximum, vl); + vfloat32m1_t exp_value = MlasComputeExpVectorRvv(shifted, vl); + + if (Output != nullptr) { + __riscv_vse32_v_f32m1(Output, exp_value, vl); + Output += vl; + } + + accumulation += MlasReduceSumRvv(exp_value, vl); + + Input += vl; + N -= vl; + } + + return accumulation; +} + +void +MLASCALL +MlasComputeSoftmaxOutputF32KernelRvv( + float* Output, + size_t N, + const float* Parameters + ) +{ + const float scale = Parameters[0]; + + while (N > 0) { + size_t vl = __riscv_vsetvl_e32m1(N); + vfloat32m1_t output = __riscv_vle32_v_f32m1(Output, vl); + output = __riscv_vfmul_vf_f32m1(output, scale, vl); + __riscv_vse32_v_f32m1(Output, output, vl); + + Output += vl; + N -= vl; + } +} + +void +MLASCALL +MlasComputeLogSoftmaxOutputF32KernelRvv( + const float* Input, + float* Output, + size_t N, + const float* Parameters + ) +{ + const float negative_maximum = Parameters[0]; + const float logarithm = Parameters[1]; + + while (N > 0) { + size_t vl = __riscv_vsetvl_e32m1(N); + vfloat32m1_t input = __riscv_vle32_v_f32m1(Input, vl); + input = __riscv_vfadd_vf_f32m1(input, negative_maximum, vl); + input = __riscv_vfsub_vf_f32m1(input, logarithm, vl); + __riscv_vse32_v_f32m1(Output, input, vl); + + Input += vl; + Output += vl; + N -= vl; + } +} + +#endif // defined(MLAS_USE_RVV) diff --git a/onnxruntime/core/mlas/lib/sgemm.cpp b/onnxruntime/core/mlas/lib/sgemm.cpp index 7836b1f89b0c4..88d0308bfa21e 100644 --- a/onnxruntime/core/mlas/lib/sgemm.cpp +++ b/onnxruntime/core/mlas/lib/sgemm.cpp @@ -247,6 +247,13 @@ Return Value: --*/ { +#if defined(MLAS_TARGET_RISCV64) && defined(MLAS_USE_RVV) && !defined(FORCE_GENERIC_ALGORITHMS) + if (GetMlasPlatform().GemmFloatKernel != nullptr) { + MlasSgemmCopyPackBRvv(D, B, ldb, CountX, CountY); + return; + } +#endif + // // Copy data from matrix B into the destination buffer 16 columns at a // time. @@ -1004,6 +1011,14 @@ Return Value: #if (defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_S390X) || defined(MLAS_TARGET_LARCH64)) && !defined(FORCE_GENERIC_ALGORITHMS) RowsHandled = GetMlasPlatform().GemmFloatKernel(A, B, C, CountK, CountM, CountN, lda, ldc, alpha, ZeroMode); +#elif defined(MLAS_TARGET_RISCV64) && !defined(FORCE_GENERIC_ALGORITHMS) + if (GetMlasPlatform().GemmFloatKernel != nullptr) { + RowsHandled = GetMlasPlatform().GemmFloatKernel(A, B, C, CountK, CountM, CountN, lda, ldc, alpha, ZeroMode); + } else if (ZeroMode) { + RowsHandled = MlasSgemmKernelZero(A, B, C, CountK, CountM, CountN, lda, ldc, alpha); + } else { + RowsHandled = MlasSgemmKernelAdd(A, B, C, CountK, CountM, CountN, lda, ldc, alpha); + } #else if (ZeroMode) { RowsHandled = MlasSgemmKernelZero(A, B, C, CountK, CountM, CountN, lda, ldc, alpha); diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc index aeddef0c5188f..28d6332f6282c 100644 --- a/onnxruntime/core/platform/posix/env.cc +++ b/onnxruntime/core/platform/posix/env.cc @@ -618,7 +618,17 @@ class PosixEnv : public Env { PosixEnv() { cpuinfo_available_ = cpuinfo_initialize(); if (!cpuinfo_available_) { - LOGS_DEFAULT(INFO) << "cpuinfo_initialize failed"; + // PosixEnv may be constructed before the logging system is initialized + // (e.g. 
via a static Env::Default() reference in the Python bindings). + // Using LOGS_DEFAULT here would crash with "Attempt to use DefaultLogger + // but none has been registered". Fall back to stderr when no logger exists. + if (logging::LoggingManager::HasDefaultLogger()) { + LOGS_DEFAULT(WARNING) << "cpuinfo_initialize failed. " + "May cause CPU EP performance degradation due to undetected CPU features."; + } else { + std::cerr << "onnxruntime warning: cpuinfo_initialize failed. " + "May cause CPU EP performance degradation due to undetected CPU features.\n"; + } } } bool cpuinfo_available_{false}; diff --git a/onnxruntime/core/providers/coreml/builders/impl/quick_gelu_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/quick_gelu_op_builder.cc new file mode 100644 index 0000000000000..2aa5d82d3f198 --- /dev/null +++ b/onnxruntime/core/providers/coreml/builders/impl/quick_gelu_op_builder.cc @@ -0,0 +1,128 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include + +#include "core/providers/common.h" +#include "core/providers/coreml/builders/helper.h" +#include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" +#include "core/providers/coreml/builders/model_builder.h" +#include "core/providers/coreml/builders/op_builder_factory.h" +#include "core/providers/coreml/shape_utils.h" +#include "core/providers/shared/utils/utils.h" + +namespace onnxruntime { +namespace coreml { + +// com.microsoft:QuickGelu is produced by ORT's QuickGeluFusion pass +// (onnxruntime/core/optimizer/quick_gelu_fusion.cc) at optimization level +// ORT_ENABLE_EXTENDED and above. The schema in contrib_defs.cc defines it as +// Y = X * Sigmoid(alpha * X) default alpha = 1.702 +// CoreML has no native equivalent, so we decompose to three MIL ops — all +// primitives are already CoreML-supported. Same approach the QNN EP uses +// in qnn/builder/opbuilder/quick_gelu_op_builder.cc. +class QuickGeluOpBuilder : public BaseOpBuilder { + Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const override; + + bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } +}; + +Status QuickGeluOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, + const Node& node, + const logging::Logger& logger) const { + // IsOpSupportedImpl gates this, but fail fast rather than silently produce an + // invalid model if the path is ever reached without MLProgram. + ORT_RETURN_IF_NOT(model_builder.CreateMLProgram(), + "QuickGelu is only supported by the CoreML EP in MLProgram format"); + + NodeAttrHelper helper(node); + const float alpha = helper.Get("alpha", 1.702f); + + const auto input_dtype = node.InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + const int32_t elem_type = static_cast(input_dtype); + const std::string& x_name = node.InputDefs()[0]->Name(); + + std::vector x_shape; + ORT_RETURN_IF_NOT(GetShape(*node.InputDefs()[0], x_shape, logger), "Failed to get QuickGelu input shape"); + + { + using namespace CoreML::Specification::MILSpec; + + // When alpha ≈ 1.0 (e.g. CLIP's approximate GELU, `x * sigmoid(x)`), skip + // the leading mul and feed x straight into sigmoid. Saves one op and + // avoids the rounding it would introduce. 
Mirrors QNN's builder at + // qnn/builder/opbuilder/quick_gelu_op_builder.cc:42-49. + constexpr float kAlphaEpsilon = 1e-6f; + const bool skip_alpha_mul = std::abs(alpha - 1.0f) < kAlphaEpsilon; + + std::string sigmoid_input_name = x_name; + std::unique_ptr mul_alpha; + if (!skip_alpha_mul) { + // alpha_x = mul(x, alpha) + mul_alpha = model_builder.CreateOperation(node, "mul", "alpha"); + AddOperationInput(*mul_alpha, "x", x_name); + if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) { + AddOperationInput(*mul_alpha, "y", model_builder.AddScalarConstant(mul_alpha->type(), "alpha", alpha)); + } else { + AddOperationInput(*mul_alpha, "y", + model_builder.AddScalarConstant(mul_alpha->type(), "alpha", MLFloat16(alpha))); + } + sigmoid_input_name = model_builder.GetUniqueName(node, "quick_gelu_alpha_x"); + AddIntermediateOperationOutput(*mul_alpha, sigmoid_input_name, elem_type, x_shape); + } + + // sig = sigmoid(sigmoid_input) + auto sig = model_builder.CreateOperation(node, "sigmoid"); + AddOperationInput(*sig, "x", sigmoid_input_name); + const std::string& sig_name = model_builder.GetUniqueName(node, "quick_gelu_sigmoid"); + AddIntermediateOperationOutput(*sig, sig_name, elem_type, x_shape); + + // y = mul(x, sig) + auto mul_final = model_builder.CreateOperation(node, "mul", "final"); + AddOperationInput(*mul_final, "x", x_name); + AddOperationInput(*mul_final, "y", sig_name); + AddOperationOutput(*mul_final, *node.OutputDefs()[0]); + + if (mul_alpha) { + model_builder.AddOperation(std::move(mul_alpha)); + } + model_builder.AddOperation(std::move(sig)); + model_builder.AddOperation(std::move(mul_final)); + } + + return Status::OK(); +} + +bool QuickGeluOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const { + // Only the MLProgram path is implemented. NeuralNetwork format is deprecated + // on Apple Silicon and not worth carrying a second implementation for. + if (!input_params.create_mlprogram) { + LOGS(logger, VERBOSE) << "QuickGelu: only MLProgram format is supported by the CoreML EP"; + return false; + } + + // AddToModelBuilderImpl requires the input shape to size intermediate MIL + // outputs, so check here and fall back to CPU if shape inference was + // incomplete — don't claim the node and then fail at model-build time. 
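As a sanity check on the three-op decomposition described above, a small scalar reference of QuickGelu (illustrative only, not ORT or CoreML code), including the alpha ≈ 1 shortcut that skips the leading multiply:

```cpp
#include <cmath>

// Scalar reference for Y = X * Sigmoid(alpha * X); 1.702 is the schema default.
// kAlphaEpsilon mirrors the tolerance used above to decide whether the leading
// multiply can be skipped.
float QuickGeluReference(float x, float alpha = 1.702f) {
  constexpr float kAlphaEpsilon = 1e-6f;
  const float sigmoid_input = (std::abs(alpha - 1.0f) < kAlphaEpsilon) ? x : alpha * x;
  const float sigmoid = 1.0f / (1.0f + std::exp(-sigmoid_input));
  return x * sigmoid;
}
```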
+ std::vector x_shape; + if (!GetShape(*node.InputDefs()[0], x_shape, logger)) { + LOGS(logger, VERBOSE) << "QuickGelu: failed to get input shape"; + return false; + } + + return true; +} + +void CreateQuickGeluOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.builders.push_back(std::make_unique()); + op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); +} + +} // namespace coreml +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc index cc301aceae466..d4f14273eeef5 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc @@ -26,6 +26,9 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateActivationOpBuilder("Elu", op_registrations); CreateActivationOpBuilder("HardSigmoid", op_registrations); + // Microsoft-domain ops produced by ORT's own optimizer passes + CreateQuickGeluOpBuilder("QuickGelu", op_registrations); + // Unary ops CreateUnaryOpBuilder("Erf", op_registrations); CreateUnaryOpBuilder("Reciprocal", op_registrations); diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h index 9b51b53d73e9e..f6304848274de 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h @@ -44,6 +44,7 @@ void CreateSplitOpBuilder(const std::string& op_type, OpBuilderRegistrations& op void CreateSqueezeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateUnaryOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateQuickGeluOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); } // namespace coreml } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/ml/linearclassifier.cc b/onnxruntime/core/providers/cpu/ml/linearclassifier.cc index 45c0a2fadc2ba..1a35c24c69676 100644 --- a/onnxruntime/core/providers/cpu/ml/linearclassifier.cc +++ b/onnxruntime/core/providers/cpu/ml/linearclassifier.cc @@ -39,9 +39,7 @@ LinearClassifier::LinearClassifier(const OpKernelInfo& info) class_count_ = static_cast(intercepts_.size()); ORT_ENFORCE(class_count_ > 0, "LinearClassifier: intercepts must not be empty."); - ORT_ENFORCE(coefficients_.size() % static_cast(class_count_) == 0, - "LinearClassifier: coefficients size (", coefficients_.size(), - ") must be a multiple of the number of classes (", class_count_, ")."); + ORT_ENFORCE(!coefficients_.empty(), "LinearClassifier: coefficients must not be empty."); SetupMlasBackendKernelSelectorFromConfigOptions(mlas_backend_kernel_selector_config_, info.GetConfigOptions()); } @@ -156,12 +154,12 @@ Status LinearClassifier::Compute(OpKernelContext* ctx) const { if (!SafeMultiply(static_cast(class_count_), static_cast(num_features), expected_coefficients_size)) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "class_count (", class_count_, ") * num_features (", num_features, - ") overflows size_t"); + "LinearClassifier: class_count (", class_count_, + ") * num_features (", num_features, ") overflows size_t"); } ORT_RETURN_IF_NOT(coefficients_.size() >= 
expected_coefficients_size, - "coefficients size (", coefficients_.size(), ") is less than class_count (", class_count_, - ") * num_features (", num_features, ")"); + "LinearClassifier: coefficients size (", coefficients_.size(), + ") is less than class_count (", class_count_, ") * num_features (", num_features, ")."); Tensor* Y = ctx->Output(0, {num_batches}); diff --git a/onnxruntime/core/providers/cpu/ml/svmclassifier.cc b/onnxruntime/core/providers/cpu/ml/svmclassifier.cc index 65bcad2be8a24..9d9808ef248f6 100644 --- a/onnxruntime/core/providers/cpu/ml/svmclassifier.cc +++ b/onnxruntime/core/providers/cpu/ml/svmclassifier.cc @@ -46,36 +46,44 @@ SVMClassifier::SVMClassifier(const OpKernelInfo& info) feature_count_ = 0; class_count_ = 0; for (size_t i = 0; i < vectors_per_class_.size(); i++) { + ORT_ENFORCE(vectors_per_class_[i] >= 0, + "vectors_per_class[", i, "] must be non-negative. Got ", vectors_per_class_[i]); starting_vector_.push_back(vector_count_); - vector_count_ += narrow(vectors_per_class_[i]); + vector_count_ += onnxruntime::narrow(vectors_per_class_[i]); } + ORT_ENFORCE(classlabels_strings_.size() > 0 || classlabels_ints_.size() > 0, "One of classlabels_strings, classlabels_ints is required."); + using_strings_ = false; if (classlabels_strings_.size() > 0) { using_strings_ = true; class_count_ = classlabels_strings_.size(); - } else if (classlabels_ints_.size() > 0) { - class_count_ = classlabels_ints_.size(); } else { - class_count_ = 1; + class_count_ = classlabels_ints_.size(); } + ORT_ENFORCE(class_count_ < 65536, "The number of classes ", class_count_, " is beyond what this kernel supports (65535)."); + ORT_ENFORCE(proba_.size() == probb_.size(), "proba and probb must have the same size."); + ORT_ENFORCE(coefficients_.size() > 0, "coefficients are empty."); + if (vector_count_ > 0) { feature_count_ = support_vectors_.size() / vector_count_; // length of each support vector mode_ = SVM_TYPE::SVM_SVC; + ORT_ENFORCE(vectors_per_class_.size() == class_count_, "Mismatch between classlabels_ints/classlabels_strings and vectors_per_class dimensions."); } else { feature_count_ = coefficients_.size() / class_count_; // liblinear mode mode_ = SVM_TYPE::SVM_LINEAR; set_kernel_type(KERNEL::LINEAR); } - ORT_ENFORCE(classlabels_strings_.size() > 0 || classlabels_ints_.size() > 0); - ORT_ENFORCE(proba_.size() == probb_.size()); - ORT_ENFORCE(coefficients_.size() > 0); - // Validate attribute array sizes against the declared dimensions to prevent // out-of-bounds reads from crafted models. 
if (mode_ == SVM_TYPE::SVM_SVC) { + ORT_ENFORCE(vectors_per_class_.size() == static_cast(class_count_), + "vectors_per_class attribute size (", vectors_per_class_.size(), + ") must match class_count (", class_count_, ")."); + ORT_ENFORCE(vector_count_ > 0, "vector_count must be greater than 0 in SVC mode."); + // SVC mode: coefficients layout is [class_count - 1, vector_count] size_t expected_coefficients = 0; if (!SafeMultiply(static_cast(class_count_ - 1), static_cast(vector_count_), @@ -87,6 +95,9 @@ SVMClassifier::SVMClassifier(const OpKernelInfo& info) "coefficients attribute size (", coefficients_.size(), ") is smaller than expected (", expected_coefficients, ") for the given class_count and vector_count."); + ORT_ENFORCE(support_vectors_.size() % static_cast(vector_count_) == 0, + "support_vectors attribute size (", support_vectors_.size(), + ") must be divisible by vector_count (", vector_count_, ")."); // rho needs one entry per classifier pair: class_count * (class_count - 1) / 2 size_t num_classifiers = 0; @@ -121,7 +132,7 @@ SVMClassifier::SVMClassifier(const OpKernelInfo& info) } template -static void ChooseClass(Tensor& output, const int64_t output_idx, float max_weight, const int64_t maxclass, +static void ChooseClass(Tensor& output, const int64_t output_idx, float max_weight, const size_t maxclass, bool have_proba, bool weights_are_all_positive, const std::vector& classlabels, const LabelType& posclass, const LabelType& negclass) { @@ -134,9 +145,9 @@ static void ChooseClass(Tensor& output, const int64_t output_idx, float max_weig else if (max_weight > 0 && !weights_are_all_positive) output_data = classlabels[1]; else - output_data = classlabels[onnxruntime::narrow(maxclass)]; + output_data = classlabels[maxclass]; } else { - output_data = classlabels[onnxruntime::narrow(maxclass)]; + output_data = classlabels[maxclass]; } } else if (max_weight > 0) { output_data = posclass; @@ -209,7 +220,7 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, const ptrdiff_t num_batches = SafeInt(input_rank == 1 ? 1 : x_shape[0]); const ptrdiff_t num_features = input_rank == 1 ? narrow(x_shape[0]) : narrow(x_shape[1]); - ORT_RETURN_IF_NOT(num_features == feature_count_ && num_features >= 0 && num_batches >= 0, + ORT_RETURN_IF_NOT(num_features == static_cast(feature_count_) && num_features >= 0 && num_batches >= 0, "Invalid input for SVMClassifier: expected feature_count=", feature_count_, ", actual num_features=", num_features, ", input_rank=", input_rank, @@ -241,11 +252,11 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, // Total number of classifiers comparing pairs between the classes // e.g. if you have A, B C and D classes, the number of classifiers to compare between each pair is 6 // with AB, AC, AD, BC, BD and CD - const int64_t num_classifiers = class_count_ * (class_count_ - 1) / 2; // == (class_count_-1)! - const int64_t class_count_squared = class_count_ * class_count_; + const size_t num_classifiers = class_count_ * (class_count_ - 1) / 2; // == (class_count_-1)! 
+ const size_t class_count_squared = class_count_ * class_count_; const bool have_proba = proba_.size() > 0; - int64_t final_scores_per_batch = class_count_; + size_t final_scores_per_batch = class_count_; if (mode_ == SVM_TYPE::SVM_SVC && !have_proba) { if (class_count_ > 2) final_scores_per_batch = num_classifiers; @@ -261,7 +272,7 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, // both outputs are required so can't be nullptr Tensor& Y = *ctx.Output(0, {num_batches}); - Tensor& Z = *ctx.Output(1, {num_batches, final_scores_per_batch}); + Tensor& Z = *ctx.Output(1, {num_batches, static_cast(final_scores_per_batch)}); auto final_scores = Z.MutableDataAsSpan(); @@ -276,7 +287,7 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, } int write_additional_scores = -1; - int64_t num_scores_per_batch = class_count_; + size_t num_scores_per_batch = class_count_; if (mode_ == SVM_TYPE::SVM_SVC && !have_proba) { num_scores_per_batch = num_classifiers; @@ -346,39 +357,39 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, // e.g. AB combines with BA. // If A has 3 support vectors and B has 2, there's a 3x2 block for AB and a 2x3 block for BA to combine - auto cur_kernels = kernels_span.subspan(n * SafeInt(vector_count_), onnxruntime::narrow(vector_count_)); - auto cur_scores = classifier_scores.subspan(n * SafeInt(num_slots_per_iteration), onnxruntime::narrow(num_classifiers)); - auto cur_votes = votes_span.subspan(n * SafeInt(class_count_), onnxruntime::narrow(class_count_)); + auto cur_kernels = kernels_span.subspan(n * SafeInt(vector_count_), vector_count_); + auto cur_scores = classifier_scores.subspan(n * SafeInt(num_slots_per_iteration), num_classifiers); + auto cur_votes = votes_span.subspan(n * SafeInt(class_count_), class_count_); auto scores_iter = cur_scores.begin(); size_t classifier_idx = 0; - for (int64_t i = 0; i < class_count_ - 1; i++) { - int64_t start_index_i = starting_vector_[onnxruntime::narrow(i)]; // start of support vectors for class i - int64_t class_i_support_count = vectors_per_class_[onnxruntime::narrow(i)]; - int64_t i_coeff_row_offset = vector_count_ * i; + for (size_t i = 0; i < class_count_ - 1; i++) { + size_t start_index_i = starting_vector_[i]; // start of support vectors for class i + size_t class_i_support_count = onnxruntime::narrow(vectors_per_class_[i]); + size_t i_coeff_row_offset = vector_count_ * i; - for (int64_t j = i + 1; j < class_count_; j++) { - int64_t start_index_j = starting_vector_[onnxruntime::narrow(j)]; // start of support vectors for class j - int64_t class_j_support_count = vectors_per_class_[onnxruntime::narrow(j)]; - int64_t j_coeff_row_offset = vector_count_ * (j - 1); + for (size_t j = i + 1; j < class_count_; j++) { + size_t start_index_j = starting_vector_[j]; // start of support vectors for class j + size_t class_j_support_count = onnxruntime::narrow(vectors_per_class_[j]); + size_t j_coeff_row_offset = vector_count_ * (j - 1); double sum = 0; - const float* val1 = &(coefficients_[j_coeff_row_offset + SafeInt(start_index_i)]); - const float* val2 = &(cur_kernels[onnxruntime::narrow(start_index_i)]); - for (int64_t m = 0; m < class_i_support_count; ++m, ++val1, ++val2) + const float* val1 = coefficients_.data() + (j_coeff_row_offset + start_index_i); + const float* val2 = cur_kernels.data() + start_index_i; + for (size_t m = 0; m < class_i_support_count; ++m, ++val1, ++val2) sum += *val1 * *val2; - val1 = &(coefficients_[i_coeff_row_offset + SafeInt(start_index_j)]); - val2 = 
&(cur_kernels[onnxruntime::narrow(start_index_j)]); + val1 = coefficients_.data() + (i_coeff_row_offset + start_index_j); + val2 = cur_kernels.data() + start_index_j; - for (int64_t m = 0; m < class_j_support_count; ++m, ++val1, ++val2) + for (size_t m = 0; m < class_j_support_count; ++m, ++val1, ++val2) sum += *val1 * *val2; sum += rho_[classifier_idx++]; *scores_iter++ = static_cast(sum); - ++(cur_votes[onnxruntime::narrow(sum > 0 ? i : j)]); + ++(cur_votes[sum > 0 ? i : j]); } } } @@ -389,23 +400,23 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, &classifier_scores_data, num_classifiers, &votes_data, &Y, num_scores_per_batch, write_additional_scores](ptrdiff_t idx) { int n = SafeInt(idx); // convert to a usable sized type - auto cur_scores = final_scores.subspan(n * SafeInt(final_scores_per_batch), onnxruntime::narrow(final_scores_per_batch)); + auto cur_scores = final_scores.subspan(n * SafeInt(final_scores_per_batch), final_scores_per_batch); if (mode_ == SVM_TYPE::SVM_SVC && have_proba) { - auto probsp2 = gsl::make_span(probsp2_data.data() + (n * class_count_squared), onnxruntime::narrow(class_count_squared)); + auto probsp2 = gsl::make_span(probsp2_data.data() + (n * class_count_squared), class_count_squared); float* classifier_scores = classifier_scores_data.data() + (n * num_classifiers); size_t index = 0; - for (int64_t i = 0; i < class_count_ - 1; ++i) { - int64_t p1 = i * class_count_ + i + 1; - int64_t p2 = (i + 1) * class_count_ + i; - for (int64_t j = i + 1; j < class_count_; ++j, ++index) { + for (size_t i = 0; i < class_count_ - 1; ++i) { + size_t p1 = i * class_count_ + i + 1; + size_t p2 = (i + 1) * class_count_ + i; + for (size_t j = i + 1; j < class_count_; ++j, ++index) { float val1 = sigmoid_probability(classifier_scores[index], proba_[index], probb_[index]); float val2 = std::max(val1, 1.0e-7f); val2 = std::min(val2, 1 - 1.0e-7f); - probsp2[onnxruntime::narrow(p1)] = val2; - probsp2[onnxruntime::narrow(p2)] = 1 - val2; + probsp2[p1] = val2; + probsp2[p2] = 1 - val2; ++p1; p2 += class_count_; } @@ -431,10 +442,10 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, // onnx specs expects one column per class. 
if (num_classifiers == 1) { // binary case if (using_strings_) { - ChooseClass(Y, n, max_weight, maxclass, have_proba, weights_are_all_positive_, + ChooseClass(Y, n, max_weight, onnxruntime::narrow(maxclass), have_proba, weights_are_all_positive_, classlabels_strings_, "1", "0"); } else { - ChooseClass(Y, n, max_weight, maxclass, have_proba, weights_are_all_positive_, + ChooseClass(Y, n, max_weight, onnxruntime::narrow(maxclass), have_proba, weights_are_all_positive_, classlabels_ints_, 1, 0); } } else { // multiclass diff --git a/onnxruntime/core/providers/cpu/ml/svmclassifier.h b/onnxruntime/core/providers/cpu/ml/svmclassifier.h index e392d0915db68..4d7ed089089f2 100644 --- a/onnxruntime/core/providers/cpu/ml/svmclassifier.h +++ b/onnxruntime/core/providers/cpu/ml/svmclassifier.h @@ -121,12 +121,12 @@ class SVMClassifier final : public OpKernel, private SVMCommon { Status ComputeImpl(OpKernelContext& ctx, gsl::span x_data, const TensorShape& x_shape) const; bool weights_are_all_positive_; - ptrdiff_t feature_count_; - ptrdiff_t class_count_; - ptrdiff_t vector_count_; + size_t feature_count_; + size_t class_count_; + size_t vector_count_; bool using_strings_; std::vector vectors_per_class_; - std::vector starting_vector_; + std::vector starting_vector_; std::vector rho_; std::vector proba_; std::vector probb_; diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index 28aedc0faae61..37306d97b06ab 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -491,7 +491,8 @@ void DataOps::populate_op_mode_supported() { } { UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, - V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1, V_2025_2, V_2025_3, V_2025_4}, + V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1, + V_2025_2, V_2025_3, V_2025_4, V_2026_0, V_2026_1}, [this](const Node* node, const InitializedTensorSet&) { auto& attributes = node->GetAttributes(); if (attributes.count("coordinate_transformation_mode") > 0) { diff --git a/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc b/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc index ee46c76f1ea54..234b1d54e69c5 100644 --- a/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc +++ b/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc @@ -83,6 +83,13 @@ Status RotaryEmbedding::ComputeInternal(ComputeContext& context) const { if (position_ids != nullptr) { // position_ids provided: cos/sin cache is 2D (max_pos, D/2) + // position_ids bounds validation is handled by shader-side defense-in-depth checks + // (OOB position_ids → pass-through input unchanged). Host-side value scanning is not possible + // because WebGPU program inputs must be GPU buffers (InputMemoryType(OrtMemTypeCPUInput) is + // incompatible with AddInputs). + // Note: ONNX RotaryEmbedding has no base-offset mode (format 0) — position_ids is always + // a 2D tensor (batch_size, sequence_length) when provided. 
+ contrib::webgpu::RotaryEmbeddingProgram program{interleaved_}; program .CacheHint(interleaved_) diff --git a/onnxruntime/core/providers/webgpu/math/gemm_packed.cc b/onnxruntime/core/providers/webgpu/math/gemm_packed.cc index 79a4f1f73902b..96fe712a41b40 100644 --- a/onnxruntime/core/providers/webgpu/math/gemm_packed.cc +++ b/onnxruntime/core/providers/webgpu/math/gemm_packed.cc @@ -34,9 +34,9 @@ Status GemmProgram::GenerateShaderCode(ShaderHelper& shader) const { MatMulReadFnSource(shader, a, b, nullptr, transA_, transB_); } if (is_vec4_) { - ORT_RETURN_IF_ERROR(MakeMatMulPackedVec4Source(shader, elements_per_thread, WorkgroupSizeX(), WorkgroupSizeY(), data_type, nullptr, transA_, transB_, alpha_, need_handle_matmul_, output_components_, /*tile_inner*/ 32, need_split_k, split_dim_inner_)); + ORT_RETURN_IF_ERROR(MakeMatMulPackedVec4Source(shader, elements_per_thread, WorkgroupSizeX(), WorkgroupSizeY(), data_type, /* batch_dims = */ nullptr, transA_, transB_, alpha_, need_handle_matmul_, output_components_, /*tile_inner*/ 32, need_split_k, split_dim_inner_)); } else { - ORT_RETURN_IF_ERROR(MakeMatMulPackedSource(shader, elements_per_thread, WorkgroupSizeX(), WorkgroupSizeY(), data_type, nullptr, transA_, transB_, alpha_, need_handle_matmul_)); + ORT_RETURN_IF_ERROR(MakeMatMulPackedSource(shader, elements_per_thread, WorkgroupSizeX(), WorkgroupSizeY(), data_type, /* batch_dims = */ nullptr, transA_, transB_, alpha_, need_handle_matmul_)); } const ShaderVariableHelper* c = nullptr; diff --git a/onnxruntime/core/providers/webgpu/math/gemm_utils.cc b/onnxruntime/core/providers/webgpu/math/gemm_utils.cc index 573d7b016310f..b762c383a7c3f 100644 --- a/onnxruntime/core/providers/webgpu/math/gemm_utils.cc +++ b/onnxruntime/core/providers/webgpu/math/gemm_utils.cc @@ -309,13 +309,27 @@ Status MakeMatMulPackedVec4Source(ShaderHelper& shader, // atomic built-in functions in `HandleMatMulWithSplitK()`. shader.MainFunctionBody() << "const kSplitK = " << split_dim_inner << ";\n" - << " let num_tiles = (kSplitK - 1) / tileInner + 1;\n" - << " var kStart = kSplitK * i32(logical_global_id.z);\n" - - // When Split-K is used, `batch` should always be 0 and `logical_global_id.z` is used to indicate - // the index of split-k instead of batch. - << " let batch = 0;\n" - << " let batchIndices = 0u;\n"; + << " let num_tiles = (kSplitK - 1) / tileInner + 1;\n"; + if (nullptr != batch_dims) { + // With Split-K and batch (in MatMul and Conv2D|MatMul), `dispatch_z` is + // `splits_per_batch * batch_size`, and `logical_global_id.z` encodes both the + // batch index and the Split-K index within that range. + // We decompose it as: + // split_index = logical_global_id.z % splits_per_batch + // batch = logical_global_id.z / splits_per_batch + shader.MainFunctionBody() + << " let splits_per_batch = uniforms.splits_per_batch;\n" + << " let split_index = i32(logical_global_id.z) % i32(splits_per_batch);\n" + << " var kStart = kSplitK * split_index;\n" + << " let batch = i32(logical_global_id.z) / i32(splits_per_batch);\n" + << " let batchIndices = " << batch_dims->OffsetToIndices("u32(batch)") << ";\n"; + } else { + // With Split-K without batch (in Gemm), `logical_global_id.z` is exactly the Split-K index. 
+ shader.MainFunctionBody() + << " var kStart = kSplitK * i32(logical_global_id.z);\n" + << " let batch = 0;\n" + << " let batchIndices = 0u;\n"; + } } else { shader.MainFunctionBody() << " let num_tiles = (uniforms.dim_inner - 1) / tileInner + 1;\n" diff --git a/onnxruntime/core/providers/webgpu/math/matmul.cc b/onnxruntime/core/providers/webgpu/math/matmul.cc index af488f2c23a30..512a3d05c09eb 100644 --- a/onnxruntime/core/providers/webgpu/math/matmul.cc +++ b/onnxruntime/core/providers/webgpu/math/matmul.cc @@ -2,6 +2,9 @@ // Licensed under the MIT License. #include "core/providers/webgpu/math/matmul.h" + +#include + #include "core/common/inlined_containers.h" #include "core/providers/cpu/tensor/utils.h" #include "core/providers/webgpu/shader_helper.h" @@ -188,18 +191,18 @@ Status ComputeMatMul(ComputeContext* context, TensorShape output_shape = helper.OutputShape(); - const int64_t dim_output_outer = output_shape[output_shape.NumDimensions() - 2]; - // check if A is batch of vector (bach is not 1, M is 1) and B is a matrix (batch is 1) - if (batchA != 1 && dim_output_outer == 1 && batchB == 1) { - // optimization for batched vector matrix multiplication - // dimensions of A: [1,`batchA`,K] - TensorShapeVector dims_a = {1, batchA, helper.K()}; + // When B is a matrix (batch is 1), we fold batchA into the M dimension for better + // performance (e.g., [2,3,5] → [1,6,5]). + if (batchA != 1 && batchB == 1) { + // dimensions of A: [1,`batchA`, M, K] + int64_t batchAndM = a_shape.SizeToDimension(a_shape.NumDimensions() - 1); + TensorShapeVector dims_a = {1, batchAndM, helper.K()}; // dimensions of B: [1,K,N] TensorShapeVector dims_b = {1, helper.K(), helper.N()}; a_shape = TensorShape(dims_a); b_shape = TensorShape(dims_b); - output_shape = {1, batchA, helper.N()}; + output_shape = {1, batchAndM, helper.N()}; } // helpful dimension variables @@ -244,13 +247,13 @@ Status ComputeMatMul(ComputeContext* context, const Tensor* bias = has_bias ? inputs[2] : nullptr; bool use_bias_in_matmul = has_bias; uint32_t split_dim_inner = 1; + uint32_t splits_per_batch = 1; // Current Split-K implementation relies on atomic operations, which are not deterministic. if (!context->KernelContext().GetUseDeterministicCompute()) { const SplitKConfig& split_k_config = context->GetSplitKConfig(); const bool need_split_k = split_k_config.UseSplitK(is_vec4, activation.activation_kind_, batch_size, dim_a_outer, dim_b_outer, dim_inner, is_channels_last); if (need_split_k) { - ORT_ENFORCE(batch_size == 1, "Split-K MatMul only supports batch_size == 1."); ORT_ENFORCE(is_vec4, "Split-K MatMul requires vec4 packing."); if (has_bias) { @@ -258,17 +261,21 @@ Status ComputeMatMul(ComputeContext* context, } // Initialize `output_tensor` with 0 or bias before MatMulProgram with Split-K enabled. - const auto fill_bias_program = CreateMatMulFillBiasOrZeroBeforeSplitKProgram(bias, output_tensor, /*is_gemm*/ false, /*beta*/ 1.0f, /*bias_components*/ 4, output_shape_temp); + const auto fill_bias_program = CreateMatMulFillBiasOrZeroBeforeSplitKProgram(bias, output_tensor, /*is_gemm*/ false, /*beta*/ 1.0f, /*bias_components*/ 4, output_shape_temp, narrow(batch_size)); ORT_RETURN_IF_ERROR(context->RunProgram(fill_bias_program)); // `bias` has been handled in the execution of `fill_bias_program` so we don't need to set // `bias` again in `MatMulProgram`. use_bias_in_matmul = false; - // With Split-K, `dim_inner` will be split into multiple parts and `dispatch_z` will be the - // number of splits along `dim_inner`. 
+ // With Split-K, `dim_inner` will be split into multiple parts. `dispatch_z` encodes + // both the split-k index and the batch index: dispatch_z = splits_per_batch * batch_size. split_dim_inner = split_k_config.GetSplitDimInner(); - dispatch_z = (dim_inner + split_dim_inner - 1) / split_dim_inner; + splits_per_batch = (dim_inner + split_dim_inner - 1) / split_dim_inner; + const uint64_t dispatch_z_u64 = static_cast(batch_size) * static_cast(splits_per_batch); + ORT_ENFORCE(dispatch_z_u64 <= static_cast(std::numeric_limits::max()), + "dispatch_z exceeds uint32_t range: ", dispatch_z_u64); + dispatch_z = narrow(dispatch_z_u64); // The output should be declared in atomic types in `MatMulProgram` for the use of atomic // built-in functions. @@ -281,7 +288,7 @@ Status ComputeMatMul(ComputeContext* context, .CacheHint(activation.ToString(), absl::StrJoin(elements_per_thread, "-"), std::to_string(is_vec4), components, is_channels_last, split_dim_inner) .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, a_shape_temp, components}, {b, ProgramTensorMetadataDependency::TypeAndRank, b_shape_temp, components}}) - .AddUniformVariables({{dim_a_outer}, {dim_b_outer}, {dim_inner}, {dispatch_x}, {dispatch_y}, {dispatch_z}}) + .AddUniformVariables({{dim_a_outer}, {dim_b_outer}, {dim_inner}, {dispatch_x}, {dispatch_y}, {dispatch_z}, {splits_per_batch}}) .AddIndices(outer_dims) .SetDispatchGroupSize(dispatch_x, dispatch_y, dispatch_z) .SetWorkgroupSize(MatMul::MATMUL_PACKED_WORKGROUP_SIZE_X, MatMul::MATMUL_PACKED_WORKGROUP_SIZE_Y, MatMul::MATMUL_PACKED_WORKGROUP_SIZE_Z) @@ -302,31 +309,32 @@ MatMulFillBiasOrZeroBeforeSplitKProgram CreateMatMulFillBiasOrZeroBeforeSplitKPr bool is_gemm, float beta, uint32_t output_components, - const TensorShape& output_shape) { + const TensorShape& output_shape, + uint32_t batch_size) { const bool has_bias = bias != nullptr; const bool bias_is_scalar = has_bias ? bias->Shape().Size() == 1 : false; - // Currently we only support GEMM and channels last format for MatMul with Split-K. MatMulFillBiasOrZeroBeforeSplitKProgram program(is_gemm, has_bias, output_components, bias_is_scalar); const uint32_t dim_a_outer = narrow(output_shape[output_shape.NumDimensions() - 2]); const uint32_t dim_b_outer = narrow(output_shape[output_shape.NumDimensions() - 1]); - // Fill one value per invocation. Now we use default workgroup size (64) for this program. - const uint32_t total_outputs = dim_a_outer * dim_b_outer; - const uint32_t dispatch_x = (total_outputs + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE; + // Fill one value per invocation across all batches. + const uint64_t total_outputs = static_cast(batch_size) * + static_cast(dim_a_outer) * + static_cast(dim_b_outer); + const uint64_t dispatch_x_u64 = CeilDiv(total_outputs, static_cast(WORKGROUP_SIZE)); + ORT_ENFORCE(dispatch_x_u64 <= static_cast(std::numeric_limits::max()), + "dispatch_x exceeds uint32_t range: ", dispatch_x_u64); + const uint32_t dispatch_x = narrow(dispatch_x_u64); - // To reuse `MatMulWriteFnSourceForGemm()` or `MatMulWriteFnSourceForMatMul()` we need to set - // `dim_b_outer` in components when `output_shape` is in `vec4`, while use `output_shape` directly - // as the output shape. 
const uint32_t dim_b_outer_components = narrow(dim_b_outer * output_components); program.CacheHint(is_gemm, has_bias, output_components, bias_is_scalar) .AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank, output_shape, static_cast(output_components)}) - .AddUniformVariables({{dim_a_outer}, {dim_b_outer_components}, {beta}}) + .AddUniformVariables({{dim_a_outer}, {dim_b_outer_components}, {beta}, {batch_size}}) .SetDispatchGroupSize(dispatch_x); if (has_bias) { - // We always use `c_components` as `output_components` in GEMM, and 4 in MatMul. const TensorShape reduced_bias_shape = ReduceShapeByComponents(bias->Shape(), output_components); program.AddInput({bias, ProgramTensorMetadataDependency::TypeAndRank, reduced_bias_shape, static_cast(output_components)}); } diff --git a/onnxruntime/core/providers/webgpu/math/matmul.h b/onnxruntime/core/providers/webgpu/math/matmul.h index d15e36ffa3d85..89101c60a1b6c 100644 --- a/onnxruntime/core/providers/webgpu/math/matmul.h +++ b/onnxruntime/core/providers/webgpu/math/matmul.h @@ -24,7 +24,8 @@ MatMulFillBiasOrZeroBeforeSplitKProgram CreateMatMulFillBiasOrZeroBeforeSplitKPr bool is_gemm, float beta, uint32_t output_components, - const TensorShape& output_shape); + const TensorShape& output_shape, + uint32_t batch_size = 1); class MatMul final : public WebGpuKernel { public: diff --git a/onnxruntime/core/providers/webgpu/math/matmul_packed.cc b/onnxruntime/core/providers/webgpu/math/matmul_packed.cc index 0883c8ddb95b5..0d2a1962dd2a0 100644 --- a/onnxruntime/core/providers/webgpu/math/matmul_packed.cc +++ b/onnxruntime/core/providers/webgpu/math/matmul_packed.cc @@ -65,7 +65,6 @@ Status MatMulFillBiasOrZeroBeforeSplitKProgram::GenerateShaderCode(ShaderHelper& } // Handle bias with `MatMulWriteFnSourceForGemm() or MatMulWriteFnSourceForMatMul()`. 
- // const uint32_t bias_components = output_components_; if (is_gemm_) { MatMulWriteFnSourceForGemm(shader, output, bias, bias_is_scalar_); } else { @@ -77,15 +76,18 @@ Status MatMulFillBiasOrZeroBeforeSplitKProgram::GenerateShaderCode(ShaderHelper& shader.MainFunctionBody() << R"( let output_id = i32(global_idx); + let batch_size = i32(uniforms.batch_size); let dim_a_outer = i32(uniforms.dim_a_outer); let dim_b_outer = i32(uniforms.dim_b_outer) / output_components; - if (output_id >= dim_a_outer * dim_b_outer) { + let elements_per_batch = dim_a_outer * dim_b_outer; + if (output_id >= batch_size * elements_per_batch) { return; } - let output_row = output_id / dim_b_outer; - let output_col = output_id % dim_b_outer; - let output_batch = 0; + let output_batch = output_id / elements_per_batch; + let remaining = output_id % elements_per_batch; + let output_row = remaining / dim_b_outer; + let output_col = remaining % dim_b_outer; let output_value = output_value_t(); mm_write(output_batch, output_row, output_col, output_value); )"; diff --git a/onnxruntime/core/providers/webgpu/math/matmul_packed.h b/onnxruntime/core/providers/webgpu/math/matmul_packed.h index 618fc97d72fe0..eceb79f3c6a98 100644 --- a/onnxruntime/core/providers/webgpu/math/matmul_packed.h +++ b/onnxruntime/core/providers/webgpu/math/matmul_packed.h @@ -27,7 +27,8 @@ class MatMulProgram final : public Program { {"dim_inner", ProgramUniformVariableDataType::Uint32}, {"logical_dispatch_x", ProgramUniformVariableDataType::Uint32}, {"logical_dispatch_y", ProgramUniformVariableDataType::Uint32}, - {"logical_dispatch_z", ProgramUniformVariableDataType::Uint32}); + {"logical_dispatch_z", ProgramUniformVariableDataType::Uint32}, + {"splits_per_batch", ProgramUniformVariableDataType::Uint32}); bool NeedSplitK() const; @@ -58,7 +59,8 @@ class MatMulFillBiasOrZeroBeforeSplitKProgram final : public Program(oss, ","), [](uint32_t i) { return std::to_string(i); }); return oss.str(); }; - program.CacheHint(activation.ToString(), is_channels_last, stringify({inner_element_size, static_cast(is_vec4 ? 1 : 0), fit_a_outer, fit_b_outer, fit_inner, tile_a_outer, tile_a_outer, tile_inner, static_cast(components)})) + + program.CacheHint(activation.ToString(), is_channels_last, stringify({inner_element_size, static_cast(is_vec4 ? 1 : 0), fit_a_outer, fit_b_outer, fit_inner, tile_a_outer, tile_b_outer, tile_inner, static_cast(components)})) .AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank, reduced_output_shape, components}) .SetDispatchGroupSize(dispatch[0], dispatch[1], dispatch[2]) .SetWorkgroupSize(workgroup_size[0], workgroup_size[1], workgroup_size[2]) diff --git a/onnxruntime/core/providers/webgpu/tensor/grid_sample.cc b/onnxruntime/core/providers/webgpu/tensor/grid_sample.cc new file mode 100644 index 0000000000000..abf7df6f4b8a2 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/grid_sample.cc @@ -0,0 +1,253 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/webgpu/tensor/grid_sample.h" + +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" + +namespace onnxruntime { +namespace webgpu { + +Status GridSampleProgram::GenerateShaderCode(ShaderHelper& shader) const { + const auto& x = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); + const auto& grid = shader.AddInput("grid", ShaderUsage::UseUniform); + const auto& y = shader.AddOutput("y", ShaderUsage::UseUniform); + + // gs_denormalize: specialized per align_corners + if (align_corners_) { + shader.AdditionalImplementation() + << "fn gs_denormalize(n: f32, length: u32) -> f32 {\n" + << " return (n + 1.0) * 0.5 * f32(length - 1u);\n" + << "}\n"; + } else { + shader.AdditionalImplementation() + << "fn gs_denormalize(n: f32, length: u32) -> f32 {\n" + << " return ((n + 1.0) * f32(length) - 1.0) * 0.5;\n" + << "}\n"; + } + + // gs_reflect: only needed for reflection padding mode + if (padding_mode_ == 2) { + shader.AdditionalImplementation() + << "fn gs_reflect(v: f32, v_min: f32, v_max: f32) -> f32 {\n" + << " var fv = v;\n" + << " let range = v_max - v_min;\n" + << " if (fv < v_min) {\n" + << " let dv = v_min - fv;\n" + << " let n = i32(dv / range);\n" + << " let r = dv - f32(n) * range;\n" + << " fv = select(v_max - r, v_min + r, n % 2 == 0);\n" + << " } else if (fv > v_max) {\n" + << " let dv = fv - v_max;\n" + << " let n = i32(dv / range);\n" + << " let r = dv - f32(n) * range;\n" + << " fv = select(v_min + r, v_max - r, n % 2 == 0);\n" + << " }\n" + << " return fv;\n" + << "}\n"; + } + + // gs_cubic_coeffs: only needed for bicubic mode + if (mode_ == 2) { + shader.AdditionalImplementation() + << "fn gs_cubic_coeffs(t: f32) -> vec4 {\n" + << " let ax = abs(t);\n" + << " let a = -0.75f;\n" + << " let c0 = ((a * (ax + 1.0) - 5.0 * a) * (ax + 1.0) + 8.0 * a) * (ax + 1.0) - 4.0 * a;\n" + << " let c1 = ((a + 2.0) * ax - (a + 3.0)) * ax * ax + 1.0;\n" + << " let c2 = ((a + 2.0) * (1.0 - ax) - (a + 3.0)) * (1.0 - ax) * (1.0 - ax) + 1.0;\n" + << " let c3 = ((a * (2.0 - ax) - 5.0 * a) * (2.0 - ax) + 8.0 * a) * (2.0 - ax) - 4.0 * a;\n" + << " return vec4(c0, c1, c2, c3);\n" + << "}\n"; + } + + // gs_pixel: pixel fetch helper, specialized per padding_mode (and align_corners for reflection) + // Returns f32 always; caller casts to output type. 
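+  // Behaviour sketch for an out-of-range request such as gs_pixel(base, -1, 2) (illustrative,
+  // not exhaustive): padding_mode "zeros" returns 0.0, "border" clamps the row to 0, and
+  // "reflection" mirrors the coordinate back into range before clamping and sampling.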
+ shader.AdditionalImplementation() + << "fn gs_pixel(img_base: u32, r: i32, col: i32) -> f32 {\n"; + + if (padding_mode_ == 0) { + // zeros: out-of-bounds -> 0 + shader.AdditionalImplementation() + << " if (r < 0 || r >= i32(uniforms.H_in) || col < 0 || col >= i32(uniforms.W_in)) {\n" + << " return 0.0;\n" + << " }\n" + << " return f32(" << x.GetByOffset("img_base + u32(r) * uniforms.W_in + u32(col)") << ");\n"; + } else if (padding_mode_ == 1) { + // border: clamp to nearest edge + shader.AdditionalImplementation() + << " let cr = u32(clamp(r, 0, i32(uniforms.H_in) - 1));\n" + << " let cc = u32(clamp(col, 0, i32(uniforms.W_in) - 1));\n" + << " return f32(" << x.GetByOffset("img_base + cr * uniforms.W_in + cc") << ");\n"; + } else { + // reflection: oscillating reflect, bounds depend on align_corners + if (align_corners_) { + // reflect within [0, length-1] + shader.AdditionalImplementation() + << " let rr = i32(gs_reflect(f32(r), 0.0, f32(uniforms.H_in) - 1.0));\n" + << " let cc = i32(gs_reflect(f32(col), 0.0, f32(uniforms.W_in) - 1.0));\n"; + } else { + // reflect within [-0.5, length-0.5] + shader.AdditionalImplementation() + << " let rr = i32(gs_reflect(f32(r), -0.5, f32(uniforms.H_in) - 0.5));\n" + << " let cc = i32(gs_reflect(f32(col), -0.5, f32(uniforms.W_in) - 0.5));\n"; + } + shader.AdditionalImplementation() + << " let ur = u32(clamp(rr, 0, i32(uniforms.H_in) - 1));\n" + << " let uc = u32(clamp(cc, 0, i32(uniforms.W_in) - 1));\n" + << " return f32(" << x.GetByOffset("img_base + ur * uniforms.W_in + uc") << ");\n"; + } + shader.AdditionalImplementation() << "}\n"; + + // Main function body + auto& body = shader.MainFunctionBody(); + body << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") + // Decode global_idx -> (n, c, h_out, w_out) + << " let HW_out = uniforms.H_out * uniforms.W_out;\n" + << " let CHW_out = uniforms.C * HW_out;\n" + << " let n = global_idx / CHW_out;\n" + << " let rem = global_idx % CHW_out;\n" + << " let c = rem / HW_out;\n" + << " let hw = rem % HW_out;\n" + << " let h_out = hw / uniforms.W_out;\n" + << " let w_out = hw % uniforms.W_out;\n" + // Read normalized grid coords: grid is [N, H_out, W_out, 2], gx=x-coord (W), gy=y-coord (H) + << " let grid_base = ((n * uniforms.H_out + h_out) * uniforms.W_out + w_out) * 2u;\n" + << " let gx = f32(" << grid.GetByOffset("grid_base") << ");\n" + << " let gy = f32(" << grid.GetByOffset("grid_base + 1u") << ");\n" + // Denormalize to image-space coordinates + << " let px = gs_denormalize(gx, uniforms.W_in);\n" + << " let py = gs_denormalize(gy, uniforms.H_in);\n" + // Base flat offset for this (n, c) plane of X: [N, C, H_in, W_in] + << " let img_base = (n * uniforms.C + c) * uniforms.H_in * uniforms.W_in;\n"; + + if (mode_ == 1) { + // nearest: round to nearest integer + body << " let rx = i32(round(px));\n" + << " let ry = i32(round(py));\n" + << " let result = gs_pixel(img_base, ry, rx);\n"; + } else if (mode_ == 0) { + // bilinear: 4-neighbor weighted interpolation + body << " let x1 = i32(floor(px));\n" + << " let y1 = i32(floor(py));\n" + << " let x2 = x1 + 1;\n" + << " let y2 = y1 + 1;\n" + << " let dx1 = px - f32(x1);\n" + << " let dx2 = 1.0 - dx1;\n" + << " let dy1 = py - f32(y1);\n" + << " let dy2 = 1.0 - dy1;\n" + << " let p11 = gs_pixel(img_base, y1, x1);\n" + << " let p12 = gs_pixel(img_base, y1, x2);\n" + << " let p21 = gs_pixel(img_base, y2, x1);\n" + << " let p22 = gs_pixel(img_base, y2, x2);\n" + << " let result = dy2 * (dx2 * p11 + dx1 * p12) + dy1 * (dx2 * p21 + dx1 * p22);\n"; + } 
else { + // bicubic: 4x4 neighborhood with Robert Keys coefficients (alpha=-0.75) + body << " let x0 = i32(floor(px)) - 1;\n" + << " let y0 = i32(floor(py)) - 1;\n" + << " let dx = px - f32(x0 + 1);\n" + << " let dy = py - f32(y0 + 1);\n" + << " let cx = gs_cubic_coeffs(dx);\n" + << " let cy = gs_cubic_coeffs(dy);\n" + << " var rows: vec4;\n" + << " for (var i = 0i; i < 4i; i++) {\n" + << " let row = y0 + i;\n" + << " rows[i] = cx[0] * gs_pixel(img_base, row, x0 )\n" + << " + cx[1] * gs_pixel(img_base, row, x0 + 1)\n" + << " + cx[2] * gs_pixel(img_base, row, x0 + 2)\n" + << " + cx[3] * gs_pixel(img_base, row, x0 + 3);\n" + << " }\n" + << " let result = dot(cy, rows);\n"; + } + + body << " " << y.SetByOffset("global_idx", "x_value_t(result)") << "\n"; + + return Status::OK(); +} + +GridSample::GridSample(const OpKernelInfo& info) : WebGpuKernel(info) { + // Accept both opset-16 names ("bilinear"/"bicubic") and opset-20+ names ("linear"/"cubic") + std::string mode_str = info.GetAttrOrDefault("mode", "bilinear"); + if (mode_str == "bilinear" || mode_str == "linear") { + mode_ = 0; + } else if (mode_str == "nearest") { + mode_ = 1; + } else if (mode_str == "bicubic" || mode_str == "cubic") { + mode_ = 2; + } else { + ORT_THROW("GridSample: unsupported mode \"", mode_str, "\""); + } + + std::string padding_mode_str = info.GetAttrOrDefault("padding_mode", "zeros"); + if (padding_mode_str == "zeros") { + padding_mode_ = 0; + } else if (padding_mode_str == "border") { + padding_mode_ = 1; + } else if (padding_mode_str == "reflection") { + padding_mode_ = 2; + } else { + ORT_THROW("GridSample: unsupported padding_mode \"", padding_mode_str, "\""); + } + + align_corners_ = static_cast(info.GetAttrOrDefault("align_corners", 0)); +} + +Status GridSample::ComputeInternal(ComputeContext& context) const { + const auto* X = context.Input(0); + const auto* grid = context.Input(1); + + const auto& X_shape = X->Shape(); + const auto& grid_shape = grid->Shape(); + + ORT_RETURN_IF_NOT(X_shape.NumDimensions() == 4, "X must be 4-D for opset 16"); + ORT_RETURN_IF_NOT(grid_shape.NumDimensions() == 4, "grid must be 4-D"); + ORT_RETURN_IF_NOT(grid_shape[3] == 2, "grid last dimension must be 2"); + + const int64_t N = X_shape[0]; + const int64_t C = X_shape[1]; + const int64_t H_in = X_shape[2]; + const int64_t W_in = X_shape[3]; + + ORT_RETURN_IF_NOT(grid_shape[0] == N, "grid batch size must match X batch size"); + + const int64_t H_out = grid_shape[1]; + const int64_t W_out = grid_shape[2]; + + TensorShape Y_shape{N, C, H_out, W_out}; + auto* Y = context.Output(0, Y_shape); + + const uint32_t output_size = onnxruntime::narrow(Y_shape.Size()); + if (output_size == 0) { + return Status::OK(); + } + + GridSampleProgram program{mode_, padding_mode_, align_corners_}; + program + .AddInputs({{X, ProgramTensorMetadataDependency::TypeAndRank}, + {grid, ProgramTensorMetadataDependency::TypeAndRank}}) + .AddOutput({Y, ProgramTensorMetadataDependency::Rank}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .CacheHint(mode_, padding_mode_, static_cast(align_corners_)) + .AddUniformVariables({{output_size}, + {static_cast(C)}, + {static_cast(H_in)}, + {static_cast(W_in)}, + {static_cast(H_out)}, + {static_cast(W_out)}}); + + return context.RunProgram(program); +} + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + GridSample, + kOnnxDomain, + 16, 19, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T1", WebGpuSupportedFloatTypes()) + .TypeConstraint("T2", 
WebGpuSupportedFloatTypes()), + GridSample); + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/tensor/grid_sample.h b/onnxruntime/core/providers/webgpu/tensor/grid_sample.h new file mode 100644 index 0000000000000..acc100c725009 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/grid_sample.h @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/webgpu/program.h" +#include "core/providers/webgpu/webgpu_kernel.h" + +namespace onnxruntime { +namespace webgpu { + +// mode: 0=bilinear(linear), 1=nearest, 2=bicubic(cubic) +// padding_mode: 0=zeros, 1=border, 2=reflection + +class GridSampleProgram final : public Program { + public: + GridSampleProgram(int mode, int padding_mode, bool align_corners) + : Program{"GridSample"}, + mode_{mode}, + padding_mode_{padding_mode}, + align_corners_{align_corners} {} + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES( + {"output_size", ProgramUniformVariableDataType::Uint32}, + {"C", ProgramUniformVariableDataType::Uint32}, + {"H_in", ProgramUniformVariableDataType::Uint32}, + {"W_in", ProgramUniformVariableDataType::Uint32}, + {"H_out", ProgramUniformVariableDataType::Uint32}, + {"W_out", ProgramUniformVariableDataType::Uint32}); + + private: + int mode_; + int padding_mode_; + bool align_corners_; +}; + +class GridSample final : public WebGpuKernel { + public: + explicit GridSample(const OpKernelInfo& info); + Status ComputeInternal(ComputeContext& context) const override; + + private: + int mode_{0}; + int padding_mode_{0}; + bool align_corners_{false}; +}; + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/vendor/intel/math/matmul.cc b/onnxruntime/core/providers/webgpu/vendor/intel/math/matmul.cc index 0362deb0fbd6a..b6ec2e0c2b10b 100644 --- a/onnxruntime/core/providers/webgpu/vendor/intel/math/matmul.cc +++ b/onnxruntime/core/providers/webgpu/vendor/intel/math/matmul.cc @@ -55,18 +55,18 @@ Status ApplyMatMulIntel(ComputeContext& context, TensorShape output_shape = helper.OutputShape(); - const int64_t dim_output_outer = output_shape[output_shape.NumDimensions() - 2]; - // check if A is batch of vector (bach is not 1, M is 1) and B is a matrix (batch is 1) - if (batchA != 1 && dim_output_outer == 1 && batchB == 1) { - // optimization for batched vector matrix multiplication - // dimensions of A: [1,`batchA`,K] - TensorShapeVector dims_a = {1, batchA, helper.K()}; + // When B is a matrix (batch is 1), we fold batchA into the M dimension for better + // performance (e.g., [2,3,5] → [1,6,5]). 
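+  // Restating the shapes involved: batchAndM below is the product of all dims of A except the
+  // last (K), i.e. batchA * M, so the folded problem is a single [batchA * M, K] x [K, N] matmul.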
+ if (batchA != 1 && batchB == 1) { + // dimensions of A: [1,`batchA`, M, K] + int64_t batchAndM = a_shape.SizeToDimension(a_shape.NumDimensions() - 1); + TensorShapeVector dims_a = {1, batchAndM, helper.K()}; // dimensions of B: [1,K,N] TensorShapeVector dims_b = {1, helper.K(), helper.N()}; a_shape = TensorShape(dims_a); b_shape = TensorShape(dims_b); - output_shape = {1, batchA, helper.N()}; + output_shape = {1, batchAndM, helper.N()}; } // helpful dimension variables diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index d85f5011ea043..d1cde04277938 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -31,6 +31,7 @@ #include "core/providers/webgpu/webgpu_profiler.h" #include "core/providers/webgpu/tensor/cast.h" #include "core/providers/webgpu/tensor/expand.h" +#include "core/providers/webgpu/tensor/grid_sample.h" #include "core/providers/webgpu/generator/range.h" #include "core/providers/webgpu/tensor/unsqueeze.h" @@ -448,6 +449,8 @@ static const BuildKernelCreateInfoFn build_kernel_create_info_function_table[] = BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, }; std::unique_ptr RegisterKernels(bool enable_graph_capture, bool enable_int64) { @@ -716,6 +719,11 @@ std::optional WebGpuExecutionProvider::ShouldConvertDataLayoutForOp(std::s return target_data_layout != DataLayout::NHWC; } + // GridSample is NCHW-only (opset 16 spec requires NCHW input) + if (node_domain == kOnnxDomain && node_op_type == "GridSample") { + return target_data_layout != DataLayout::NHWC; + } + // WebGPU perfer NCHW for InstanceNormalization due to a better performance if (node_domain == kOnnxDomain && node_op_type == "InstanceNormalization") { return target_data_layout != DataLayout::NHWC; diff --git a/onnxruntime/core/providers/webgpu/webgpu_utils.cc b/onnxruntime/core/providers/webgpu/webgpu_utils.cc index 5127801ca8451..ec0664c5fdb6a 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_utils.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_utils.cc @@ -36,34 +36,36 @@ SplitKConfig::SplitKConfig(const wgpu::AdapterInfo& adapter_info) { } else if (adapter_info.architecture == std::string_view{"xe-2lpg"} || adapter_info.architecture == std::string_view{"xe-2hpg"} || adapter_info.architecture == std::string_view{"gen-12hp"}) { - // Below thresholds are only verified on Intel discreate GPUs and Lunar Lake iGPUs. + // Below thresholds are only verified on Intel discrete GPUs and Lunar Lake iGPUs. enable_split_k_ = true; + max_batch_size_ = 8; split_dim_inner_ = 256; min_dim_inner_with_split_k_ = split_dim_inner_ * 2; - configs_per_dim_inner_range_.emplace_back(768, 52.0f); - configs_per_dim_inner_range_.emplace_back(2304, 35.0f); - configs_per_dim_inner_range_.emplace_back(3072, 21.5f); - configs_per_dim_inner_range_.emplace_back(4096, 16.0f); + configs_per_dim_inner_range_.emplace_back(768, 52.0); + configs_per_dim_inner_range_.emplace_back(2304, 35.0); + configs_per_dim_inner_range_.emplace_back(3072, 21.5); + configs_per_dim_inner_range_.emplace_back(4096, 16.0); } else { // Below are the default thresholds on newer Intel GPUs. These values are chosen on // Intel "gen-12lp" GPU with 32EUs. 
enable_split_k_ = true; + max_batch_size_ = 8; split_dim_inner_ = 256; min_dim_inner_with_split_k_ = split_dim_inner_ * 2; - configs_per_dim_inner_range_.emplace_back(768, 20.0f); - configs_per_dim_inner_range_.emplace_back(1792, 13.0f); - configs_per_dim_inner_range_.emplace_back(3072, 8.0f); - configs_per_dim_inner_range_.emplace_back(4096, 6.0f); + configs_per_dim_inner_range_.emplace_back(768, 20.0); + configs_per_dim_inner_range_.emplace_back(1792, 13.0); + configs_per_dim_inner_range_.emplace_back(3072, 8.0); + configs_per_dim_inner_range_.emplace_back(4096, 6.0); } } } -SplitKConfig::ConfigAtRange::ConfigAtRange(uint32_t max_dim_inner, float rate) - : max_dim_inner_with_rate(max_dim_inner), max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner(rate) {} +SplitKConfig::ConfigAtRange::ConfigAtRange(uint32_t max_dim_inner, double rate) + : max_dim_inner_with_rate(max_dim_inner), max_dim_a_outer_x_dim_b_outer_x_batch_size_divides_dim_inner(rate) {} uint32_t SplitKConfig::GetMaxDimInnerWithSplitK() const { assert(!configs_per_dim_inner_range_.empty()); @@ -87,7 +89,10 @@ bool SplitKConfig::UseSplitK( // TODO: support the cases below. use_split_k &= activation_kind == ActivationKind::None; use_split_k &= is_vec4; - use_split_k &= batch_size == 1; + + // Larger batches increase parallelism on their own, so we temporarily set a batch size threshold + // for using Split-K. + use_split_k &= batch_size <= max_batch_size_; // `is_channels_last` should only affect Split-K gating when bias is applied in the non-GEMM // MatMul/Conv|MatMul path. For GEMM and for MatMul or Conv|MatMul without bias, we need to @@ -97,8 +102,8 @@ bool SplitKConfig::UseSplitK( use_split_k &= is_channels_last; // Split-K works best when `dim_inner` is relatively large compared with `dim_a_outer` and - // `dim_b_outer`. Currently we use the factor between `(dim_a_outer * dim_b_outer)` and - // `dim_inner)` as the metric to decide whether to use Split-K or not. + // `dim_b_outer`. Currently we use the factor between `(dim_a_outer * dim_b_outer * batch_size)` + // and `dim_inner` as the metric to decide whether to use Split-K or not. 
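+  // Worked example (illustrative numbers, using the default thresholds set in the constructor
+  // above): dim_a_outer = 64, dim_b_outer = 64, batch_size = 2, dim_inner = 1024 gives
+  // rate = 64 * 64 * 2 / 1024 = 8.0, which is within the 13.0 limit of the dim_inner <= 1792
+  // range, so Split-K is used.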
use_split_k &= dim_inner >= min_dim_inner_with_split_k_; use_split_k &= dim_inner <= GetMaxDimInnerWithSplitK(); @@ -106,10 +111,10 @@ bool SplitKConfig::UseSplitK( return false; } - const float rate = dim_a_outer * dim_b_outer * 1.0f / dim_inner; + const double rate = static_cast(dim_a_outer) * static_cast(dim_b_outer) * static_cast(batch_size) / static_cast(dim_inner); for (const auto& config_at_range : configs_per_dim_inner_range_) { if (dim_inner <= config_at_range.max_dim_inner_with_rate) { - return rate <= config_at_range.max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner; + return rate <= config_at_range.max_dim_a_outer_x_dim_b_outer_x_batch_size_divides_dim_inner; } } return false; diff --git a/onnxruntime/core/providers/webgpu/webgpu_utils.h b/onnxruntime/core/providers/webgpu/webgpu_utils.h index cbceaf2be120d..d4bb245e3e9e8 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_utils.h +++ b/onnxruntime/core/providers/webgpu/webgpu_utils.h @@ -115,13 +115,14 @@ class SplitKConfig { bool enable_split_k_ = false; uint32_t split_dim_inner_ = 0; uint32_t min_dim_inner_with_split_k_ = 0; + uint32_t max_batch_size_ = 0; uint32_t GetMaxDimInnerWithSplitK() const; struct ConfigAtRange { - ConfigAtRange(uint32_t max_dim_inner, float rate); + ConfigAtRange(uint32_t max_dim_inner, double rate); uint32_t max_dim_inner_with_rate = 0; - float max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner = 0.0f; + double max_dim_a_outer_x_dim_b_outer_x_batch_size_divides_dim_inner = 0.0; }; std::vector configs_per_dim_inner_range_; }; diff --git a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc index 727531f6a42d5..37b3c8eae7ebd 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc @@ -95,8 +95,10 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, options.set("padding", emscripten::val::array(padding)); const auto ceil_mode = helper.Get("ceil_mode", 0); - options.set("roundingType", ceil_mode == 0 ? emscripten::val("floor") - : emscripten::val("ceil")); + emscripten::val output_shape_rounding = ceil_mode == 0 ? emscripten::val("floor") : emscripten::val("ceil"); + // WebNN renamed roundingType to outputShapeRounding, but set older name too for compatibility. + options.set("roundingType", output_shape_rounding); + options.set("outputShapeRounding", output_shape_rounding); // WebNN doesn't support AveragePool with count_include_pad == 1, emulate it by pad + averagePool2d. if (op_type == "AveragePool" && helper.Get("count_include_pad", 0) == 1) { diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index ff4c08c9b14c0..3f28529e7a847 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -432,6 +432,112 @@ union PtrConvert { const char** strings; }; +// Shared validation for COO indices used by both FillSparseTensorCoo and UseCooIndices. +// Returns nullptr on success, OrtStatus* on validation failure. 
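+// Example of inputs that pass the checks below (illustrative only): for a 3x4 dense shape with
+// two values, linear indices {1, 7} (indices_num == values_size) are accepted, as are 2-D index
+// pairs {0, 1, 1, 3} (indices_num == 2 * values_size); an index of 12 or a negative index would
+// be rejected.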
+OrtStatus* ValidateCooIndices(const int64_t* indices_data, size_t indices_num, + size_t values_size, const TensorShape& dense_shape) { + if (indices_num > 0 && indices_data == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "indices_data must not be null when indices_num > 0."); + } + if ((values_size == 0) != (indices_num == 0)) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + values_size == 0 + ? "COO indices must be empty when the sparse tensor has no values." + : "COO indices must be provided when the sparse tensor has values."); + } + if (values_size > 0 && indices_num > 0) { + if (indices_num == values_size) { + const auto dense_size = dense_shape.Size(); + for (size_t i = 0; i < indices_num; ++i) { + if (indices_data[i] < 0 || indices_data[i] >= dense_size) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + MakeString("COO linear index out of bounds: ", indices_data[i], + " must be in [0, ", dense_size, ")") + .c_str()); + } + } + } else if (indices_num / 2 == values_size && indices_num % 2 == 0) { + if (dense_shape.NumDimensions() != 2) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "COO 2D indices require dense shape of 2 dimensions"); + } + const auto rows = dense_shape.GetDims()[0]; + const auto cols = dense_shape.GetDims()[1]; + size_t tuple_idx = 0; + for (size_t i = 0; i < values_size; ++i, tuple_idx += 2) { + auto r = indices_data[tuple_idx]; + auto c = indices_data[tuple_idx + 1]; + if (r < 0 || r >= rows || c < 0 || c >= cols) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + MakeString("COO 2D index out of bounds: (", r, ", ", c, + ") must be in [0, ", rows, ") x [0, ", cols, ")") + .c_str()); + } + } + } else { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "COO indices count must be equal to or twice the values count."); + } + } + return nullptr; +} + +// Shared validation for CSR indices used by both FillSparseTensorCsr and UseCsrIndices. +// Returns nullptr on success, OrtStatus* on validation failure. 
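+// Example of inputs that pass the checks below (illustrative only): for a 3x4 dense shape with
+// three values, inner (column) indices {0, 1, 2} and outer (row offset) indices {0, 1, 2, 3}
+// are accepted: outer starts at 0, never decreases, and its last element equals the inner count.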
+OrtStatus* ValidateCsrIndices(const int64_t* inner_data, size_t inner_num, + const int64_t* outer_data, size_t outer_num, + const TensorShape& dense_shape) { + if (inner_num > 0 && inner_data == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "inner index data must not be null when inner index count > 0."); + } + if (outer_num > 0 && outer_data == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "outer index data must not be null when outer index count > 0."); + } + if (dense_shape.NumDimensions() == 2 && inner_num > 0) { + const auto cols = dense_shape.GetDims()[1]; + for (size_t i = 0; i < inner_num; ++i) { + if (inner_data[i] < 0 || inner_data[i] >= cols) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + MakeString("CSR inner index out of bounds: ", inner_data[i], + " must be in [0, ", cols, ")") + .c_str()); + } + } + } + if (outer_num > 0) { + if (outer_data[0] != 0) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + MakeString("CSR outer index must start at 0, got: ", outer_data[0]).c_str()); + } + if (outer_data[outer_num - 1] != static_cast(inner_num)) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + MakeString("CSR outer index last element must equal inner index count (", + inner_num, "), got: ", outer_data[outer_num - 1]) + .c_str()); + } + int64_t prev = 0; + for (size_t i = 0; i < outer_num; ++i) { + auto val = outer_data[i]; + if (val < prev || val > static_cast(inner_num)) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + MakeString("CSR outer index out of bounds or not monotonically non-decreasing: ", val).c_str()); + } + prev = val; + } + } + return nullptr; +} + #endif // !defined(DISABLE_SPARSE_TENSORS) } // namespace @@ -446,6 +552,11 @@ ORT_API_STATUS_IMPL(OrtApis::FillSparseTensorCoo, _Inout_ OrtValue* ort_value, _ auto values_size = narrow(values_t_shape.Size()); auto indices_span = gsl::make_span(indices_data, indices_num); + if (auto* status = ValidateCooIndices(indices_data, indices_num, values_size, + sparse_tensor.DenseShape())) { + return status; + } + if (sparse_tensor.IsDataTypeString()) { PtrConvert conv(values); ORT_THROW_IF_ERROR(sparse_tensor.MakeCooStrings(values_size, conv.strings, indices_span)); @@ -481,6 +592,13 @@ ORT_API_STATUS_IMPL(OrtApis::FillSparseTensorCsr, _Inout_ OrtValue* ort_value, _ auto inner_indices_span = gsl::make_span(inner_indices_data, inner_indices_num); auto outer_indices_span = gsl::make_span(outer_indices_data, outer_indices_num); + + if (auto* status = ValidateCsrIndices(inner_indices_data, inner_indices_num, + outer_indices_data, outer_indices_num, + sparse_tensor.DenseShape())) { + return status; + } + if (sparse_tensor.IsDataTypeString()) { PtrConvert conv(values); ORT_THROW_IF_ERROR(sparse_tensor.MakeCsrStrings(values_size, conv.strings, inner_indices_span, outer_indices_span)); @@ -592,6 +710,12 @@ ORT_API_STATUS_IMPL(OrtApis::UseCooIndices, _Inout_ OrtValue* ort_value, _Inout_ ? gsl::span() : gsl::make_span(indices_data, indices_num); + if (auto* status = ValidateCooIndices(indices_data, indices_num, + sparse_tensor.NumValues(), + sparse_tensor.DenseShape())) { + return status; + } + ORT_THROW_IF_ERROR(sparse_tensor.UseCooIndices(indices_span)); return nullptr; #else @@ -616,6 +740,12 @@ ORT_API_STATUS_IMPL(OrtApis::UseCsrIndices, _Inout_ OrtValue* ort_value, auto outer_span = (outer_num == 0 || outer_data == nullptr) ? 
gsl::span() : gsl::make_span(outer_data, outer_num); + + if (auto* status = ValidateCsrIndices(inner_data, inner_num, outer_data, outer_num, + sparse_tensor.DenseShape())) { + return status; + } + ORT_THROW_IF_ERROR(sparse_tensor.UseCsrIndices(inner_span, outer_span)); return nullptr; #else diff --git a/onnxruntime/core/session/ort_version_check.h b/onnxruntime/core/session/ort_version_check.h index 82fd757e3ce9f..f8fab0367b17d 100644 --- a/onnxruntime/core/session/ort_version_check.h +++ b/onnxruntime/core/session/ort_version_check.h @@ -10,21 +10,20 @@ namespace onnxruntime::version_check { -// A simple consteval-friendly result type for ParseUint. -// std::optional triggers an internal compiler error in MSVC 14.44 when used with consteval. +// A simple constexpr-friendly result type for ParseUint. struct ParseUintResult { uint32_t value; bool has_value; - consteval bool operator==(uint32_t other) const { return has_value && value == other; } - consteval bool operator!=(uint32_t other) const { return !(*this == other); } + constexpr bool operator==(uint32_t other) const { return has_value && value == other; } + constexpr bool operator!=(uint32_t other) const { return !(*this == other); } }; -inline consteval ParseUintResult ParseUintNone() { return {0, false}; } +inline constexpr ParseUintResult ParseUintNone() { return {0, false}; } // Parse a non-negative integer from a string_view without leading zeros. // Returns a result with has_value == false on failure (empty, leading zero, non-digit, or overflow). -consteval ParseUintResult ParseUint(std::string_view str) { +constexpr ParseUintResult ParseUint(std::string_view str) { if (str.empty()) return ParseUintNone(); // Leading zeros are not allowed (except "0" itself). if (str.size() > 1 && str[0] == '0') return ParseUintNone(); @@ -42,7 +41,7 @@ consteval ParseUintResult ParseUint(std::string_view str) { // - Major version is 1 // - Y and Z are non-negative integers without leading zeros // - Y (minor version) must equal expected_api_version (defaults to ORT_API_VERSION) -consteval bool IsOrtVersionValid(std::string_view version, uint32_t expected_api_version = ORT_API_VERSION) { +constexpr bool IsOrtVersionValid(std::string_view version, uint32_t expected_api_version = ORT_API_VERSION) { size_t first_dot = version.find('.'); if (first_dot == std::string_view::npos) return false; size_t second_dot = version.find('.', first_dot + 1); diff --git a/onnxruntime/python/backend/backend.py b/onnxruntime/python/backend/backend.py index 19f46189e2933..69be7a7657adf 100644 --- a/onnxruntime/python/backend/backend.py +++ b/onnxruntime/python/backend/backend.py @@ -17,6 +17,29 @@ from onnxruntime import InferenceSession, SessionOptions, get_available_providers, get_device from onnxruntime.backend.backend_rep import OnnxRuntimeBackendRep +# Allowlist of SessionOptions attributes that are safe to set via the backend API. 
+# Dangerous attributes intentionally excluded: +# optimized_model_filepath — triggers Model::Save(), overwrites arbitrary files +# profile_file_prefix — writes profiling JSON to arbitrary path +# enable_profiling — causes uncontrolled file writes to cwd +_ALLOWED_SESSION_OPTIONS = frozenset( + { + "enable_cpu_mem_arena", + "enable_mem_pattern", + "enable_mem_reuse", + "execution_mode", + "execution_order", + "graph_optimization_level", + "inter_op_num_threads", + "intra_op_num_threads", + "log_severity_level", + "log_verbosity_level", + "logid", + "use_deterministic_compute", + "use_per_session_threads", + } +) + class OnnxRuntimeBackend(Backend): """ @@ -93,16 +116,18 @@ def supports_device(cls, device): @classmethod def prepare(cls, model, device=None, **kwargs): """ - Load the model and creates a :class:`onnxruntime.InferenceSession` + Load the model and creates an :class:`onnxruntime.backend.backend_rep.OnnxRuntimeBackendRep` ready to be used as a backend. - :param model: ModelProto (returned by `onnx.load`), - string for a filename or bytes for a serialized model + :param model: the model to prepare — accepts a file path (str), serialized + model (bytes), :class:`onnx.ModelProto`, :class:`onnxruntime.InferenceSession`, + or :class:`onnxruntime.backend.backend_rep.OnnxRuntimeBackendRep` (returned as-is) :param device: requested device for the computation, None means the default one which depends on the compilation settings - :param kwargs: see :class:`onnxruntime.SessionOptions` - :return: :class:`onnxruntime.InferenceSession` + :param kwargs: only a safe subset of :class:`onnxruntime.SessionOptions` attributes are + accepted; see ``_ALLOWED_SESSION_OPTIONS`` for the list + :return: :class:`onnxruntime.backend.backend_rep.OnnxRuntimeBackendRep` """ if isinstance(model, OnnxRuntimeBackendRep): return model @@ -111,8 +136,14 @@ def prepare(cls, model, device=None, **kwargs): elif isinstance(model, (str, bytes)): options = SessionOptions() for k, v in kwargs.items(): - if hasattr(options, k): + if k in _ALLOWED_SESSION_OPTIONS: setattr(options, k, v) + elif hasattr(options, k): + raise RuntimeError( + f"SessionOptions attribute '{k}' is not permitted via the backend API. " + f"Allowed attributes: {', '.join(sorted(_ALLOWED_SESSION_OPTIONS))}" + ) + # else: silently ignore unknown keys excluded_providers = os.getenv("ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS", default="").split(",") providers = [x for x in get_available_providers() if (x not in excluded_providers)] @@ -148,13 +179,21 @@ def run_model(cls, model, inputs, device=None, **kwargs): """ Compute the prediction. - :param model: :class:`onnxruntime.InferenceSession` returned - by function *prepare* + :param model: the model to run — accepts a file path (str), serialized + model (bytes), :class:`onnx.ModelProto`, :class:`onnxruntime.InferenceSession`, + or :class:`onnxruntime.backend.backend_rep.OnnxRuntimeBackendRep` :param inputs: inputs :param device: requested device for the computation, None means the default one which depends on the compilation settings - :param kwargs: see :class:`onnxruntime.RunOptions` + :param kwargs: ``run_model()`` forwards kwargs to both ``prepare()`` and ``rep.run()``. + ``prepare()`` validates and applies ``_ALLOWED_SESSION_OPTIONS`` only when creating + a new session from a model path or bytes; if ``model`` is already an + ``InferenceSession`` or ``OnnxRuntimeBackendRep``, session-option kwargs are + silently ignored. 
``rep.run()`` always validates against ``_ALLOWED_RUN_OPTIONS`` + and raises ``RuntimeError`` for known-but-blocked run attributes. + Logging-related kwargs (``log_severity_level``, ``log_verbosity_level``, ``logid``) + appear in both allowlists. :return: predictions """ rep = cls.prepare(model, device, **kwargs) diff --git a/onnxruntime/python/backend/backend_rep.py b/onnxruntime/python/backend/backend_rep.py index a30569d004d34..950ce417c6c2d 100644 --- a/onnxruntime/python/backend/backend_rep.py +++ b/onnxruntime/python/backend/backend_rep.py @@ -10,11 +10,23 @@ from onnxruntime import RunOptions +# Allowlist of RunOptions attributes that are safe to set via the backend API. +# 'terminate' excluded: setting it True would deny the current inference call. +# 'training_mode' excluded: silently switches inference behavior in training builds. +_ALLOWED_RUN_OPTIONS = frozenset( + { + "log_severity_level", + "log_verbosity_level", + "logid", + "only_execute_path_to_fetches", + } +) + class OnnxRuntimeBackendRep(BackendRep): """ - Computes the prediction for a pipeline converted into - an :class:`onnxruntime.InferenceSession` node. + Wraps an :class:`onnxruntime.InferenceSession` to implement ONNX's + :class:`onnx.backend.base.BackendRep` interface for running predictions. """ def __init__(self, session): @@ -27,12 +39,24 @@ def run(self, inputs, **kwargs): # type: (Any, **Any) -> Tuple[Any, ...] """ Computes the prediction. See :meth:`onnxruntime.InferenceSession.run`. + + :param inputs: a list of input arrays (one per model input) or a single + array when the model has exactly one input + :param kwargs: only a safe subset of :class:`onnxruntime.RunOptions` attributes are + accepted; see ``_ALLOWED_RUN_OPTIONS`` for the list + :return: list of output arrays """ options = RunOptions() for k, v in kwargs.items(): - if hasattr(options, k): + if k in _ALLOWED_RUN_OPTIONS: setattr(options, k, v) + elif hasattr(options, k): + raise RuntimeError( + f"RunOptions attribute '{k}' is not permitted via the backend API. " + f"Allowed attributes: {', '.join(sorted(_ALLOWED_RUN_OPTIONS))}" + ) + # else: silently ignore unknown keys if isinstance(inputs, list): inps = {} diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index def2240358c10..e35e3c5753d36 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -13,10 +13,11 @@ from enum import IntEnum from typing import Any +import numpy as np + from onnxruntime.capi import _pybind_state as C if typing.TYPE_CHECKING: - import numpy as np import numpy.typing as npt import onnxruntime @@ -1212,8 +1213,6 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray: If ``None`` (default), a copy will be made only if needed. :return: A numpy array with the same data as the OrtValue. """ - import numpy as np # noqa: PLC0415 - arr = self.numpy() if copy is not None: @@ -1302,15 +1301,25 @@ def from_dlpack(cls, data, /) -> OrtValue: return cls(C.OrtValue.from_dlpack(capsule, is_bool)) - def update_inplace(self, np_arr) -> None: + def update_inplace(self, data) -> None: """ - Update the OrtValue in place with a new Numpy array. The numpy contents - are copied over to the device memory backing the OrtValue. It can be used - to update the input valuess for an InferenceSession with CUDA graph - enabled or other scenarios where the OrtValue needs to be updated while - the memory address can not be changed. 
+ Update the OrtValue in place. The source data is copied over to the device + memory backing the OrtValue. It can be used to update the input values for + an InferenceSession with CUDA graph enabled or other scenarios where the + OrtValue needs to be updated while the memory address can not be changed. + + :param data: The source data, which can be a Numpy array or another OrtValue. + When an OrtValue is provided, data can be copied between devices (e.g., + GPU to GPU) without going through the CPU. """ - self._ortvalue.update_inplace(np_arr) + if isinstance(data, OrtValue): + self._ortvalue.update_inplace(data._ortvalue) + return + + if not isinstance(data, np.ndarray): + raise TypeError("data must be a numpy.ndarray or an OrtValue.") + + self._ortvalue.update_inplace(data) def copy_tensors(src: Sequence[OrtValue], dst: Sequence[OrtValue], stream=None) -> None: diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index 89651c2d955de..fa609fe6ea83d 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -1071,5 +1071,116 @@ void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const } } +void UpdateOrtValueInplace(OrtValue& dst, const OrtValue& src) { + if (!dst.IsTensor()) { + throw std::runtime_error("Inplace update of OrtValues is only supported for Tensors"); + } + if (!src.IsTensor()) { + throw std::runtime_error("The source OrtValue must contain a Tensor"); + } + + const auto& dst_tensor = dst.Get(); + const auto& src_tensor = src.Get(); + + if (dst_tensor.DataType() != src_tensor.DataType()) { + throw std::runtime_error("The source and destination OrtValues must have the same data type"); + } + + if (dst_tensor.Shape().Size() != src_tensor.Shape().Size()) { + throw std::runtime_error("The source and destination OrtValues must have the same size"); + } + + if (dst_tensor.IsDataTypeString()) { + throw std::runtime_error("Inplace update of string tensors is not supported"); + } + + size_t bytes = 0; + auto status = Tensor::CalculateTensorStorageSize(dst_tensor.DataType(), dst_tensor.Shape(), 0, bytes); + if (!status.IsOK()) { + throw std::runtime_error(status.ErrorMessage()); + } + + const auto src_device = src_tensor.Location().device; + const auto dst_device = dst_tensor.Location().device; + + void* dst_ptr = dst.GetMutable()->MutableDataRaw(); + const void* src_ptr = src_tensor.DataRaw(); + + if (src_device.UsesCpuMemory() && dst_device.UsesCpuMemory()) { + memcpy(dst_ptr, src_ptr, bytes); + } else { + auto copy_fn = CreateDataTransferMemCpy(src_device, dst_device); + if (!copy_fn) { + // Fall back to built-in EP copy functions. + // Gate each path on (Type, VendorId) so that builds with multiple GPU EPs + // (e.g. CUDA + DML) route through the correct backend. 
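+      // For instance, a GPU device reporting VendorIds::NVIDIA only matches the CUDA branch
+      // below, while the DML branch additionally accepts OrtDevice::DML, so a CUDA+DML build
+      // copies each tensor through the backend that actually owns its memory.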
+#ifdef USE_CUDA + const auto is_cuda_device = [](const OrtDevice& device) { + return device.Type() == OrtDevice::GPU && device.Vendor() == OrtDevice::VendorIds::NVIDIA; + }; + + if (is_cuda_device(src_device) && is_cuda_device(dst_device)) { + auto data_transfer = GetGPUDataTransfer(); + ORT_THROW_IF_ERROR(data_transfer->CopyTensor(src_tensor, *dst.GetMutable())); + return; + } + if (src_device.UsesCpuMemory() && is_cuda_device(dst_device)) { + CpuToCudaMemCpy(dst_ptr, src_ptr, bytes); + return; + } + if (is_cuda_device(src_device) && dst_device.UsesCpuMemory()) { + CudaToCpuMemCpy(dst_ptr, src_ptr, bytes); + return; + } +#endif +#if USE_MIGRAPHX + const auto is_migraphx_device = [](const OrtDevice& device) { + return device.Type() == OrtDevice::GPU && device.Vendor() == OrtDevice::VendorIds::AMD; + }; + + if (src_device.UsesCpuMemory() && is_migraphx_device(dst_device)) { + CpuToMIGraphXMemCpy(dst_ptr, src_ptr, bytes); + return; + } + if (is_migraphx_device(src_device) && dst_device.UsesCpuMemory()) { + MIGraphXToCpuMemCpy(dst_ptr, src_ptr, bytes); + return; + } +#endif +#if USE_DML + const auto is_dml_device = [](const OrtDevice& device) { + return (device.Type() == OrtDevice::GPU && device.Vendor() == OrtDevice::VendorIds::MICROSOFT) || + device.Type() == OrtDevice::DML; + }; + + if (src_device.UsesCpuMemory() && is_dml_device(dst_device)) { + CpuToDmlMemCpy(dst_ptr, src_ptr, bytes); + return; + } + if (is_dml_device(src_device) && dst_device.UsesCpuMemory()) { + DmlToCpuMemCpy(dst_ptr, src_ptr, bytes); + return; + } +#endif +#ifdef USE_CANN + const auto is_cann_device = [](const OrtDevice& device) { + return device.Type() == OrtDevice::NPU && device.Vendor() == OrtDevice::VendorIds::HUAWEI; + }; + + if (src_device.UsesCpuMemory() && is_cann_device(dst_device)) { + CpuToCannMemCpy(dst_ptr, src_ptr, bytes); + return; + } + if (is_cann_device(src_device) && dst_device.UsesCpuMemory()) { + CannToCpuMemCpy(dst_ptr, src_ptr, bytes); + return; + } +#endif + throw std::runtime_error("Unable to copy data between the source and destination devices"); + } + copy_fn(dst_ptr, src_ptr, bytes); + } +} + } // namespace python } // namespace onnxruntime diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.h b/onnxruntime/python/onnxruntime_pybind_mlvalue.h index 144b3edcad404..097c5b4d20d65 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.h +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.h @@ -138,6 +138,11 @@ pybind11::object GetPyObjFromTensor(const OrtValue& rtensor, const std::unordered_map* mem_cpy_to_host_functions = nullptr, bool zero_copy_non_owning = false); +// Update the tensor data in an OrtValue in-place from another OrtValue. +// Both OrtValues must contain tensors of the same data type and size. +// This function supports various device-to-device transfers. 
+void UpdateOrtValueInplace(OrtValue& dst, const OrtValue& src); + // The below two functions are used to convert OrtValue to numpy arrays /// diff --git a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc index eb966ac5fc314..168d57fc0827b 100644 --- a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc @@ -237,6 +237,9 @@ void addOrtValueMethods(pybind11::module& m) { throw std::runtime_error("Unsupported device: Cannot update the OrtValue on this device"); } }) + .def("update_inplace", [](OrtValue* ml_value, const OrtValue& source) { + python::UpdateOrtValueInplace(*ml_value, source); + }) // Create an ortvalue value on top of the numpy array, but interpret the data // as a different type with the same element size. .def_static("ortvalue_from_numpy_with_onnx_type", [](py::array& data, int32_t onnx_element_type) -> std::unique_ptr { diff --git a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc index 1154f3b9f88b8..c30501c431a6c 100644 --- a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc +++ b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc @@ -95,25 +95,29 @@ void addSparseTensorMethods(pybind11::module& m) { py::class_(m, "SparseCooView") // Returns a numpy array of COO indices backed by Sparse Tensor memory // be aware that indices may reside on GPU if Sparse Tensor is on GPU - .def("indices", [](const PySparseCooView* view) -> py::array { + .def("indices", [](py::object self) -> py::array { + auto* view = self.cast(); const auto& indices = view->Indices(); - return MakeNumpyArrayFromIndices(indices, py::cast(*view)); + return MakeNumpyArrayFromIndices(indices, self); }); py::class_(m, "SparseCsrView") - .def("inner", [](const PySparseCsrView* view) -> py::array { + .def("inner", [](py::object self) -> py::array { + auto* view = self.cast(); const auto& indices = view->Inner(); - return MakeNumpyArrayFromIndices(indices, py::cast(*view)); + return MakeNumpyArrayFromIndices(indices, self); }) - .def("outer", [](const PySparseCsrView* view) -> py::array { + .def("outer", [](py::object self) -> py::array { + auto* view = self.cast(); const auto& indices = view->Outer(); - return MakeNumpyArrayFromIndices(indices, py::cast(*view)); + return MakeNumpyArrayFromIndices(indices, self); }); py::class_(m, "SparseBlockSparseView") - .def("indices", [](const PySparseBlockSparseView* view) -> py::array { + .def("indices", [](py::object self) -> py::array { + auto* view = self.cast(); const auto& indices = view->Indices(); - return MakeNumpyArrayFromIndices(indices, py::cast(*view)); + return MakeNumpyArrayFromIndices(indices, self); }); py::class_ sparse_bind(m, "SparseTensor"); @@ -296,7 +300,8 @@ void addSparseTensorMethods(pybind11::module& m) { }) // Returns a numpy array that is backed by SparseTensor values memory // be aware that it may be on GPU - .def("values", [](const PySparseTensor* py_tensor) -> py::array { + .def("values", [](py::object self) -> py::array { + auto* py_tensor = self.cast(); const SparseTensor& sparse_tensor = py_tensor->Instance(); if (sparse_tensor.Format() == SparseFormat::kUndefined) { ORT_THROW("This sparse tensor instance does not contain data"); @@ -311,7 +316,7 @@ void addSparseTensorMethods(pybind11::module& m) { auto dtype = t_disp.InvokeRet(); const auto& values = sparse_tensor.Values(); // See https://github.com/pybind/pybind11/issues/2271 - py::array result(dtype, 
values.Shape().GetDims(), values.DataRaw(), py::cast(*py_tensor)); + py::array result(dtype, values.Shape().GetDims(), values.DataRaw(), self); assert(!result.owndata()); // Set a read-only flag PyArray_CLEARFLAGS(reinterpret_cast(result.ptr()), NPY_ARRAY_WRITEABLE); diff --git a/onnxruntime/test/common/test_cpuinfo_sysfs_fallback.py b/onnxruntime/test/common/test_cpuinfo_sysfs_fallback.py new file mode 100644 index 0000000000000..12511512314a5 --- /dev/null +++ b/onnxruntime/test/common/test_cpuinfo_sysfs_fallback.py @@ -0,0 +1,563 @@ +#!/usr/bin/env python3 +""" +Simulation test for the cpuinfo sysfs fallback fix. + +This test verifies two fixes for https://github.com/microsoft/onnxruntime/issues/10038: + +1. Safe logging in env.cc - PosixEnv constructor no longer crashes when the + logging system is not yet initialized and cpuinfo_initialize() fails. + +2. cpuinfo sysfs fallback - The patched cpuinfo library falls back to + sysconf(_SC_NPROCESSORS_ONLN) for both processor counts and per-CPU + present/possible flags when /sys/devices/system/cpu/{possible,present} + files are missing. + +Testing approach: +- Test 1: Compile a small C++ program that calls the safe logging pattern + without a registered logger. Verify it doesn't crash. +- Test 2: Compile a small C program that validates the sysconf fallback + arithmetic and verifies that the fallback marks each online CPU with both + PRESENT and POSSIBLE flags. This catches the incomplete count-only fallback. +- Test 3: Use an LD_PRELOAD shim (like the lambda-arm64-onnx workaround) + to simulate missing sysfs files and verify ORT loads without crash. + +Note: Tests 2 and 3 require a build of ORT with the patches applied. +Test 1 can run standalone. +""" + +import os +import shutil +import subprocess +import sys +import tempfile +import textwrap +import unittest + + +def _require_linux(): + if sys.platform != "linux": + raise unittest.SkipTest("Test requires Linux") + + +def _require_gcc(): + if not shutil.which("gcc"): + raise unittest.SkipTest("gcc not found") + + +def _require_gpp(): + if not shutil.which("g++"): + raise unittest.SkipTest("g++ not found") + + +class TestCpuinfoSysfsFallback(unittest.TestCase): + def test_safe_logging_pattern(self): + """Verify the safe logging pattern doesn't crash when no logger exists. + + This simulates the fix in env.cc where we check HasDefaultLogger() before + calling LOGS_DEFAULT(). We compile a minimal C++ program that: + - Does NOT register a default logger + - Calls the safe logging pattern + - Verifies it writes to stderr instead of crashing + """ + _require_linux() + _require_gpp() + + source = textwrap.dedent(r""" + #include + #include + + // Minimal simulation of ORT's logging check pattern + namespace logging { + class LoggingManager { + public: + // Simulate: no default logger registered + static bool HasDefaultLogger() { return false; } + }; + } // namespace logging + + void LogEarlyWarning(std::string_view message) { + if (logging::LoggingManager::HasDefaultLogger()) { + // Would call LOGS_DEFAULT(WARNING) here - but logger doesn't exist + // This path should NOT be taken + std::cerr << "BUG: should not reach here\n"; + return; + } + // Safe fallback to stderr + std::cerr << "onnxruntime warning: " << message << "\n"; + } + + int main() { + // This simulates what PosixEnv() does when cpuinfo_initialize() fails + bool cpuinfo_available = false; // Simulating failure + if (!cpuinfo_available) { + LogEarlyWarning("cpuinfo_initialize failed. 
" + "May cause CPU EP performance degradation due to undetected CPU features."); + } + std::cout << "PASS: Safe logging pattern works without crash\n"; + return 0; + } + """) + + with tempfile.NamedTemporaryFile(suffix=".cc", mode="w", delete=False) as f: + f.write(source) + src_path = f.name + + try: + exe_path = src_path.replace(".cc", "") + result = subprocess.run( + ["g++", "-std=c++17", "-o", exe_path, src_path], check=False, capture_output=True, text=True + ) + self.assertEqual(result.returncode, 0, f"Compilation failed: {result.stderr}") + + result = subprocess.run([exe_path], check=False, capture_output=True, text=True, timeout=10) + self.assertEqual( + result.returncode, 0, f"Program crashed with exit code {result.returncode}: {result.stderr}" + ) + self.assertIn("PASS", result.stdout) + finally: + os.unlink(src_path) + if os.path.exists(src_path.replace(".cc", "")): + os.unlink(src_path.replace(".cc", "")) + + def test_sysconf_fallback(self): + """Verify sysconf(_SC_NPROCESSORS_ONLN) works as a complete fallback. + + This doesn't test the actual cpuinfo patch (that requires building cpuinfo) + but verifies the fallback mechanism produces correct counts and marks + present/possible flags for each online CPU. + """ + _require_linux() + _require_gcc() + + source = textwrap.dedent(r""" + #include + #include + #include + + #define CPUINFO_LINUX_FLAG_PRESENT 0x1 + #define CPUINFO_LINUX_FLAG_POSSIBLE 0x2 + + int main() { + long nproc = sysconf(_SC_NPROCESSORS_ONLN); + if (nproc <= 0) { + printf("FAIL: sysconf(_SC_NPROCESSORS_ONLN) returned %ld\n", nproc); + return 1; + } + // Simulate what the patched cpuinfo max-count helpers return: + // max_processor = nproc - 1 (0-indexed). Then arm_linux_init does: + // 1 + max_processor = nproc. + unsigned int max_processor = (unsigned int)(nproc - 1); + unsigned int arm_linux_processors_count = 1 + max_processor; + + uint32_t processor_flags[1024] = {0}; + unsigned int processors_count = arm_linux_processors_count; + if (processors_count > 1024) { + processors_count = 1024; + } + + // Simulate cpuinfo_linux_detect_possible_processors() and + // cpuinfo_linux_detect_present_processors() fallback helpers. 
+ for (unsigned int processor = 0; processor < processors_count; ++processor) { + processor_flags[processor] |= CPUINFO_LINUX_FLAG_PRESENT; + processor_flags[processor] |= CPUINFO_LINUX_FLAG_POSSIBLE; + } + + unsigned int valid_processors = 0; + const uint32_t valid_processor_mask = CPUINFO_LINUX_FLAG_PRESENT | CPUINFO_LINUX_FLAG_POSSIBLE; + for (unsigned int processor = 0; processor < processors_count; ++processor) { + if ((processor_flags[processor] & valid_processor_mask) == valid_processor_mask) { + ++valid_processors; + } + } + + printf("sysconf(_SC_NPROCESSORS_ONLN) = %ld\n", nproc); + printf("Simulated max_processor = %u\n", max_processor); + printf("Simulated arm_linux_processors_count = %u\n", arm_linux_processors_count); + printf("Simulated valid_processors = %u\n", valid_processors); + + if (arm_linux_processors_count == (unsigned int)nproc && valid_processors == processors_count) { + printf("PASS: Fallback produces correct processor count and flags\n"); + return 0; + } + printf("FAIL: Processor count or flags mismatch\n"); + return 1; + } + """) + + with tempfile.NamedTemporaryFile(suffix=".c", mode="w", delete=False) as f: + f.write(source) + src_path = f.name + + try: + exe_path = src_path.replace(".c", "") + result = subprocess.run(["gcc", "-o", exe_path, src_path], check=False, capture_output=True, text=True) + self.assertEqual(result.returncode, 0, f"Compilation failed: {result.stderr}") + + result = subprocess.run([exe_path], check=False, capture_output=True, text=True, timeout=10) + self.assertEqual(result.returncode, 0, f"exit code {result.returncode}: {result.stdout}") + self.assertIn("PASS", result.stdout) + finally: + os.unlink(src_path) + if os.path.exists(src_path.replace(".c", "")): + os.unlink(src_path.replace(".c", "")) + + def test_sysfs_hide_with_ld_preload(self): + """Verify LD_PRELOAD shim can hide sysfs files. + + This compiles a small shim that intercepts open-family calls to return + ENOENT for /sys/devices/system/cpu/{possible,present}, then runs a test + program that opens those files. + """ + _require_linux() + _require_gcc() + + shim_source = textwrap.dedent(r""" + #define _GNU_SOURCE + #include + #include + #include + #include + #include + #include + #include + +#ifndef O_TMPFILE +#define O_TMPFILE 0 +#endif + + static const char *CPU_POSSIBLE = "/sys/devices/system/cpu/possible"; + static const char *CPU_PRESENT = "/sys/devices/system/cpu/present"; + + static int is_blocked(const char *path) { + return (strcmp(path, CPU_POSSIBLE) == 0 || strcmp(path, CPU_PRESENT) == 0); + } + + static mode_t get_mode_if_needed(int flags, va_list args) { + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) ? va_arg(args, mode_t) : 0; + } + + int open(const char *path, int flags, ...) { + static int (*real_open)(const char *, int, ...) = NULL; + va_list args; + mode_t mode = 0; + + if (!real_open) real_open = dlsym(RTLD_NEXT, "open"); + if (is_blocked(path)) { + errno = ENOENT; + return -1; + } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_open(path, flags, mode) + : real_open(path, flags); + } + + int open64(const char *path, int flags, ...) { + static int (*real_open64)(const char *, int, ...) 
= NULL; + va_list args; + mode_t mode = 0; + + if (!real_open64) real_open64 = dlsym(RTLD_NEXT, "open64"); + if (is_blocked(path)) { + errno = ENOENT; + return -1; + } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_open64(path, flags, mode) + : real_open64(path, flags); + } + + int openat(int dirfd, const char *path, int flags, ...) { + static int (*real_openat)(int, const char *, int, ...) = NULL; + va_list args; + mode_t mode = 0; + + if (!real_openat) real_openat = dlsym(RTLD_NEXT, "openat"); + if (path && is_blocked(path)) { + errno = ENOENT; + return -1; + } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_openat(dirfd, path, flags, mode) + : real_openat(dirfd, path, flags); + } + + int openat64(int dirfd, const char *path, int flags, ...) { + static int (*real_openat64)(int, const char *, int, ...) = NULL; + va_list args; + mode_t mode = 0; + + if (!real_openat64) real_openat64 = dlsym(RTLD_NEXT, "openat64"); + if (path && is_blocked(path)) { + errno = ENOENT; + return -1; + } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_openat64(dirfd, path, flags, mode) + : real_openat64(dirfd, path, flags); + } + + FILE *fopen(const char *restrict path, const char *restrict mode) { + static FILE *(*real_fopen)(const char *, const char *) = NULL; + if (!real_fopen) real_fopen = dlsym(RTLD_NEXT, "fopen"); + + if (is_blocked(path)) { + errno = ENOENT; + return NULL; + } + return real_fopen(path, mode); + } + """) + + test_source = textwrap.dedent(r""" + #include + #include + #include + #include + #include + + static int try_open(const char *path) { + int fd = open(path, O_RDONLY); + if (fd >= 0) { + close(fd); + } + return fd; + } + + int main() { + int fd; + int pass = 1; + + fd = try_open("/sys/devices/system/cpu/possible"); + if (fd >= 0) { + printf("FAIL: /sys/devices/system/cpu/possible should be blocked\n"); + pass = 0; + } else { + printf("OK: /sys/devices/system/cpu/possible blocked (errno=%d: %s)\n", + errno, strerror(errno)); + } + + fd = try_open("/sys/devices/system/cpu/present"); + if (fd >= 0) { + printf("FAIL: /sys/devices/system/cpu/present should be blocked\n"); + pass = 0; + } else { + printf("OK: /sys/devices/system/cpu/present blocked (errno=%d: %s)\n", + errno, strerror(errno)); + } + + // Verify other files still work + fd = try_open("/proc/cpuinfo"); + if (fd < 0) { + printf("WARN: /proc/cpuinfo not accessible (may be OK in some envs)\n"); + } else { + printf("OK: /proc/cpuinfo still accessible\n"); + } + + if (pass) { + printf("PASS: LD_PRELOAD sysfs-hiding shim works correctly\n"); + } + return pass ? 
0 : 1; + } + """) + + with tempfile.TemporaryDirectory() as tmpdir: + shim_path = os.path.join(tmpdir, "hide_sysfs.c") + shim_so = os.path.join(tmpdir, "hide_sysfs.so") + test_path = os.path.join(tmpdir, "test_sysfs.c") + test_exe = os.path.join(tmpdir, "test_sysfs") + + with open(shim_path, "w") as f: + f.write(shim_source) + with open(test_path, "w") as f: + f.write(test_source) + + # Compile shim + result = subprocess.run( + ["gcc", "-shared", "-fPIC", "-o", shim_so, shim_path, "-ldl"], + check=False, + capture_output=True, + text=True, + ) + self.assertEqual(result.returncode, 0, f"Shim compilation failed: {result.stderr}") + + # Compile test + result = subprocess.run(["gcc", "-o", test_exe, test_path], check=False, capture_output=True, text=True) + self.assertEqual(result.returncode, 0, f"Test compilation failed: {result.stderr}") + + # Run with LD_PRELOAD + env = os.environ.copy() + env["LD_PRELOAD"] = shim_so + result = subprocess.run([test_exe], check=False, capture_output=True, text=True, timeout=10, env=env) + self.assertEqual(result.returncode, 0, f"exit code {result.returncode}: {result.stdout}") + self.assertIn("PASS", result.stdout) + + def test_ort_import_with_hidden_sysfs(self): + """Integration test - import onnxruntime with hidden sysfs files. + + This uses the LD_PRELOAD shim to hide /sys/devices/system/cpu/{possible,present} + and then imports onnxruntime. This is the actual end-to-end test that + verifies both fixes work together. + + NOTE: This requires onnxruntime to be built with the patches applied. + """ + _require_linux() + _require_gcc() + + # Check if onnxruntime is importable + result = subprocess.run( + [sys.executable, "-c", "import onnxruntime"], check=False, capture_output=True, text=True, timeout=30 + ) + if result.returncode != 0: + self.skipTest("onnxruntime not installed/importable") + + shim_source = textwrap.dedent(r""" + #define _GNU_SOURCE + #include + #include + #include + #include + #include + #include + #include + +#ifndef O_TMPFILE +#define O_TMPFILE 0 +#endif + + static const char *CPU_POSSIBLE = "/sys/devices/system/cpu/possible"; + static const char *CPU_PRESENT = "/sys/devices/system/cpu/present"; + + static int is_blocked(const char *path) { + return (strcmp(path, CPU_POSSIBLE) == 0 || strcmp(path, CPU_PRESENT) == 0); + } + + static mode_t get_mode_if_needed(int flags, va_list args) { + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) ? va_arg(args, mode_t) : 0; + } + + int open(const char *path, int flags, ...) { + static int (*real_open)(const char *, int, ...) = NULL; + va_list args; + mode_t mode = 0; + + if (!real_open) real_open = dlsym(RTLD_NEXT, "open"); + if (is_blocked(path)) { errno = ENOENT; return -1; } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_open(path, flags, mode) + : real_open(path, flags); + } + + int open64(const char *path, int flags, ...) { + static int (*real_open64)(const char *, int, ...) = NULL; + va_list args; + mode_t mode = 0; + + if (!real_open64) real_open64 = dlsym(RTLD_NEXT, "open64"); + if (is_blocked(path)) { errno = ENOENT; return -1; } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_open64(path, flags, mode) + : real_open64(path, flags); + } + + int openat(int dirfd, const char *path, int flags, ...) { + static int (*real_openat)(int, const char *, int, ...) 
= NULL; + va_list args; + mode_t mode = 0; + + if (!real_openat) real_openat = dlsym(RTLD_NEXT, "openat"); + if (path && is_blocked(path)) { errno = ENOENT; return -1; } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_openat(dirfd, path, flags, mode) + : real_openat(dirfd, path, flags); + } + + int openat64(int dirfd, const char *path, int flags, ...) { + static int (*real_openat64)(int, const char *, int, ...) = NULL; + va_list args; + mode_t mode = 0; + + if (!real_openat64) real_openat64 = dlsym(RTLD_NEXT, "openat64"); + if (path && is_blocked(path)) { errno = ENOENT; return -1; } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_openat64(dirfd, path, flags, mode) + : real_openat64(dirfd, path, flags); + } + + FILE *fopen(const char *restrict path, const char *restrict mode) { + static FILE *(*real_fopen)(const char *, const char *) = NULL; + if (!real_fopen) real_fopen = dlsym(RTLD_NEXT, "fopen"); + if (is_blocked(path)) { errno = ENOENT; return NULL; } + return real_fopen(path, mode); + } + """) + + with tempfile.TemporaryDirectory() as tmpdir: + shim_path = os.path.join(tmpdir, "hide_sysfs.c") + shim_so = os.path.join(tmpdir, "hide_sysfs.so") + + with open(shim_path, "w") as f: + f.write(shim_source) + + result = subprocess.run( + ["gcc", "-shared", "-fPIC", "-o", shim_so, shim_path, "-ldl"], + check=False, + capture_output=True, + text=True, + ) + self.assertEqual(result.returncode, 0, f"Shim compilation failed: {result.stderr}") + + env = os.environ.copy() + env["LD_PRELOAD"] = shim_so + + # Try importing onnxruntime with hidden sysfs + ort_script = ( + "import onnxruntime; print('PASS: onnxruntime imported successfully'); " + "print(f'Version: {onnxruntime.__version__}'); " + "print(f'Providers: {onnxruntime.get_available_providers()}')" + ) + result = subprocess.run( + [sys.executable, "-c", ort_script], + check=False, + capture_output=True, + text=True, + timeout=60, + env=env, + ) + self.assertEqual(result.returncode, 0, f"exit code {result.returncode}: {result.stderr}") + self.assertIn("PASS", result.stdout) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc index 1fc410c37da14..880c10137f3fe 100644 --- a/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc +++ b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc @@ -937,10 +937,11 @@ TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_OOB_CUDA_Passthroug test.AddInput("input", {batch_size, sequence_length, hidden_size}, input_data); // position_id = 2048 exceeds max_sequence_length = 8 — CUDA should pass through input unchanged. test.AddInput("position_ids", {batch_size, sequence_length}, {2048}); + // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output. test.AddInput("cos_cache", {max_sequence_length, head_size / 2}, - std::vector(max_sequence_length * head_size / 2, 1.0f)); + std::vector(max_sequence_length * head_size / 2, 0.5f)); test.AddInput("sin_cache", {max_sequence_length, head_size / 2}, - std::vector(max_sequence_length * head_size / 2, 0.0f)); + std::vector(max_sequence_length * head_size / 2, 0.866f)); // Output should equal input when position_id is OOB (pass-through). 
test.AddOutput("output", {batch_size, sequence_length, hidden_size}, input_data); @@ -1054,5 +1055,122 @@ TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_RejectsRank4MalformedCacheWidth {}, nullptr, &execution_providers); } +// Test that OOB position_ids on WebGPU (format 1) pass through input unchanged (shader-side defense). +TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_OOB_WebGPU_Passthrough) { + if (nullptr == DefaultWebGpuExecutionProvider().get()) { + GTEST_SKIP() << "WebGPU execution provider is not available."; + } + + int batch_size = 1; + int sequence_length = 2; + int num_heads = 2; + int head_size = 4; + int max_sequence_length = 8; + int hidden_size = num_heads * head_size; + + OpTester test("RotaryEmbedding", 1, onnxruntime::kMSDomain); + test.AddAttribute("interleaved", static_cast(0)); + + std::vector input_data(batch_size * sequence_length * hidden_size); + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast(i + 1); + } + + test.AddInput("input", {batch_size, sequence_length, hidden_size}, input_data); + // Both position_ids exceed max_sequence_length = 8 — shader passes through input unchanged. + test.AddInput("position_ids", {batch_size, sequence_length}, {999, 999}); + // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output. + test.AddInput("cos_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.5f)); + test.AddInput("sin_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.866f)); + + // Output should equal input when position_id is OOB (pass-through). + test.AddOutput("output", {batch_size, sequence_length, hidden_size}, input_data); + test.SetOutputAbsErr("output", 0.0f); + + std::vector> execution_providers; + execution_providers.push_back(DefaultWebGpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +// Test that format-0 OOB position_ids base offset passes through on WebGPU (shader-side defense). +TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Format0_OOB_WebGPU_Passthrough) { + if (nullptr == DefaultWebGpuExecutionProvider().get()) { + GTEST_SKIP() << "WebGPU execution provider is not available."; + } + + int batch_size = 1; + int sequence_length = 2; + int num_heads = 2; + int head_size = 4; + int max_sequence_length = 8; + int hidden_size = num_heads * head_size; + + OpTester test("RotaryEmbedding", 1, onnxruntime::kMSDomain); + test.AddAttribute("interleaved", static_cast(0)); + + std::vector input_data(batch_size * sequence_length * hidden_size); + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast(i + 1); + } + + test.AddInput("input", {batch_size, sequence_length, hidden_size}, input_data); + // Format 0: base offset 8, effective positions = [8, 9] — both OOB for max_sequence_length = 8. + test.AddInput("position_ids", {1}, {8}); + // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output. + test.AddInput("cos_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.5f)); + test.AddInput("sin_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.866f)); + + // Output should equal input when all positions are OOB (pass-through). 
+ test.AddOutput("output", {batch_size, sequence_length, hidden_size}, input_data); + test.SetOutputAbsErr("output", 0.0f); + + std::vector> execution_providers; + execution_providers.push_back(DefaultWebGpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +// Test that negative position_ids pass through on WebGPU (shader-side defense catches raw_pos < 0). +TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Negative_WebGPU_Passthrough) { + if (nullptr == DefaultWebGpuExecutionProvider().get()) { + GTEST_SKIP() << "WebGPU execution provider is not available."; + } + + int batch_size = 1; + int sequence_length = 1; + int num_heads = 2; + int head_size = 4; + int max_sequence_length = 8; + int hidden_size = num_heads * head_size; + + OpTester test("RotaryEmbedding", 1, onnxruntime::kMSDomain); + test.AddAttribute("interleaved", static_cast(0)); + + std::vector input_data(hidden_size); + for (int i = 0; i < hidden_size; ++i) { + input_data[i] = static_cast(i + 1); + } + + test.AddInput("input", {batch_size, sequence_length, hidden_size}, input_data); + // Negative position_id — shader checks raw_pos < 0 and passes through. + test.AddInput("position_ids", {batch_size, sequence_length}, {-5}); + // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output. + test.AddInput("cos_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.5f)); + test.AddInput("sin_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.866f)); + + // Output should equal input when position_id is negative (pass-through). + test.AddOutput("output", {batch_size, sequence_length, hidden_size}, input_data); + test.SetOutputAbsErr("output", 0.0f); + + std::vector> execution_providers; + execution_providers.push_back(DefaultWebGpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc index f97fefb085d84..59ec8f51b4f4e 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -2294,6 +2294,251 @@ TEST(SparseTensorConversionTests, SparseTensorProtoToDense_ValuesSizeMismatch_Ra EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("values data size does not match expected")); } +// Tests for SparseTensorProtoToDenseTensorProto with negative indices (model-loading path) +TEST(SparseTensorConversionTests, SparseTensorProtoToDense_NegativeIndex_Rank1) { + // Dense size 4 + // Index -1 -> negative, out of bounds + ONNX_NAMESPACE::SparseTensorProto sparse; + sparse.mutable_values()->set_name("test_neg_idx"); + sparse.add_dims(4); + + auto* val = sparse.mutable_values(); + val->add_dims(1); + val->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + val->add_float_data(1.0f); + + auto* ind = sparse.mutable_indices(); + ind->add_dims(1); + ind->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + ind->add_int64_data(-1); + + ONNX_NAMESPACE::TensorProto dense; + auto status = utils::SparseTensorProtoToDenseTensorProto(sparse, {}, dense); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("index is out of bounds")); +} + +TEST(SparseTensorConversionTests, SparseTensorProtoToDense_NegativeIndex_Rank2) { + // 
Dense Shape [3, 3]
+  // Index [-1, 0] -> negative row, out of bounds
+  ONNX_NAMESPACE::SparseTensorProto sparse;
+  sparse.mutable_values()->set_name("test_neg_idx_2d");
+  sparse.add_dims(3);
+  sparse.add_dims(3);
+
+  auto* val = sparse.mutable_values();
+  val->add_dims(1);
+  val->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  val->add_float_data(1.0f);
+
+  auto* ind = sparse.mutable_indices();
+  ind->add_dims(1);
+  ind->add_dims(2);
+  ind->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  ind->add_int64_data(-1);
+  ind->add_int64_data(0);
+
+  ONNX_NAMESPACE::TensorProto dense;
+  auto status = utils::SparseTensorProtoToDenseTensorProto(sparse, {}, dense);
+  EXPECT_FALSE(status.IsOK());
+  EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("index is out of bounds"));
+}
+
+// Tests for SparseCooToDenseTensor and SparseCsrToDenseTensor (sparse_utils.cc paths)
+TEST(SparseTensorConversionTests, SparseCooToDense_NegativeLinearIndex) {
+  auto* cpu_provider = TestCPUExecutionProvider();
+  auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0];
+
+  DataTransferManager dtm;
+  ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer()));
+
+  // Create a SparseTensor with COO format and a negative linear index
+  SparseTensor src(DataTypeImpl::GetType<float>(), TensorShape{3, 3}, cpu_allocator);
+  std::vector<float> values = {1, 2, 3};
+  std::vector<int64_t> bad_indices = {-1, 3, 5};  // -1 is invalid
+
+  ASSERT_STATUS_OK(src.MakeCooData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(),
+                                   values.size(), values.data(), gsl::make_span(bad_indices)));
+
+  Tensor dense_dst;
+  auto status = sparse_utils::SparseCooToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst);
+  EXPECT_FALSE(status.IsOK());
+  EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid COO index"));
+}
+
+TEST(SparseTensorConversionTests, SparseCooToDense_UpperBoundLinearIndex) {
+  auto* cpu_provider = TestCPUExecutionProvider();
+  auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0];
+
+  DataTransferManager dtm;
+  ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer()));
+
+  // Dense 3x3 = 9 elements.
Index 9 is out of bounds (valid: 0-8) + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {1, 2, 3}; + std::vector bad_indices = {0, 3, 9}; + + ASSERT_STATUS_OK(src.MakeCooData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), gsl::make_span(bad_indices))); + + Tensor dense_dst; + auto status = sparse_utils::SparseCooToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid COO index")); +} + +TEST(SparseTensorConversionTests, SparseCooToDense_Negative2DIndex) { + auto* cpu_provider = TestCPUExecutionProvider(); + auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0]; + + DataTransferManager dtm; + ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer())); + + // 2D indices: (-1, 0) is invalid + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {1, 2}; + std::vector bad_indices = {-1, 0, 1, 1}; // 2D, first entry has negative row + + ASSERT_STATUS_OK(src.MakeCooData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), gsl::make_span(bad_indices))); + + Tensor dense_dst; + auto status = sparse_utils::SparseCooToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid COO 2D index")); +} + +TEST(SparseTensorConversionTests, SparseCsrToDense_NegativeColumnIndex) { + auto* cpu_provider = TestCPUExecutionProvider(); + auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0]; + + DataTransferManager dtm; + ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer())); + + // 3x3 dense, CSR with a negative column index + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {1, 2, 3}; + std::vector inner = {-1, 0, 2}; // -1 is invalid column + std::vector outer = {0, 1, 2, 3}; + + ASSERT_STATUS_OK(src.MakeCsrData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), + gsl::make_span(inner), gsl::make_span(outer))); + + Tensor dense_dst; + auto status = sparse_utils::SparseCsrToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid CSR column index")); +} + +TEST(SparseTensorConversionTests, SparseCsrToDense_ColumnIndexOutOfBounds) { + auto* cpu_provider = TestCPUExecutionProvider(); + auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0]; + + DataTransferManager dtm; + ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer())); + + // 3x3 dense, CSR with column index 3 (valid: 0-2) + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {1, 2, 3}; + std::vector inner = {1, 3, 1}; // 3 is out of bounds for 3 columns + std::vector outer = {0, 1, 2, 3}; + + ASSERT_STATUS_OK(src.MakeCsrData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), + gsl::make_span(inner), gsl::make_span(outer))); + + Tensor dense_dst; + auto status = sparse_utils::SparseCsrToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid CSR column index")); +} + +// Regression test: SparseCsrToDenseTensor 
must use correct source index for each +// non-zero value. Previously src_idx was never incremented, so all entries got values[0]. +// Using distinct values exposes this bug. +TEST(SparseTensorConversionTests, SparseCsrToDense_DistinctValuesRoundtrip) { + auto* cpu_provider = TestCPUExecutionProvider(); + auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0]; + + DataTransferManager dtm; + ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer())); + + // 3x3 dense matrix: + // 0 0 10 + // 20 0 30 + // 0 0 0 + // CSR: values={10, 20, 30}, inner(col)={2, 0, 2}, outer={0, 1, 3, 3} + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {10, 20, 30}; + std::vector inner = {2, 0, 2}; + std::vector outer = {0, 1, 3, 3}; + + ASSERT_STATUS_OK(src.MakeCsrData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), + gsl::make_span(inner), gsl::make_span(outer))); + + Tensor dense_dst; + ASSERT_STATUS_OK(sparse_utils::SparseCsrToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst)); + + std::vector expected_dense = { + 0, 0, 10, + 20, 0, 30, + 0, 0, 0}; + + auto dense_span = dense_dst.DataAsSpan(); + ASSERT_EQ(dense_span.size(), expected_dense.size()); + for (size_t i = 0; i < expected_dense.size(); ++i) { + EXPECT_EQ(dense_span[i], expected_dense[i]) << "Mismatch at index " << i; + } +} + +// Test that COO 2D validation catches out-of-range column even when +// the linearized index would be in bounds. E.g., for a 3x3 matrix, +// (row=0, col=4) gives linear index 4 which is in [0,9), but col=4 >= cols=3. +TEST(SparseTensorConversionTests, SparseCooToDense_2DColumnOutOfRange) { + auto* cpu_provider = TestCPUExecutionProvider(); + auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0]; + + DataTransferManager dtm; + ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer())); + + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {1}; + // (row=0, col=4): linear index = 0*3+4 = 4, valid linear but col >= cols + std::vector bad_indices = {0, 4}; + + ASSERT_STATUS_OK(src.MakeCooData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), gsl::make_span(bad_indices))); + + Tensor dense_dst; + auto status = sparse_utils::SparseCooToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid COO 2D index")); +} + +// Test that COO 2D validation catches out-of-range row. 
+TEST(SparseTensorConversionTests, SparseCooToDense_2DRowOutOfRange) { + auto* cpu_provider = TestCPUExecutionProvider(); + auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0]; + + DataTransferManager dtm; + ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer())); + + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {1}; + // (row=3, col=0): row >= rows + std::vector bad_indices = {3, 0}; + + ASSERT_STATUS_OK(src.MakeCooData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), gsl::make_span(bad_indices))); + + Tensor dense_dst; + auto status = sparse_utils::SparseCooToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid COO 2D index")); +} + #endif // !defined(DISABLE_SPARSE_TENSORS) } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/mlas/bench/riscv64/README.md b/onnxruntime/test/mlas/bench/riscv64/README.md new file mode 100644 index 0000000000000..136c40d39430f --- /dev/null +++ b/onnxruntime/test/mlas/bench/riscv64/README.md @@ -0,0 +1,77 @@ +# RISC-V MLAS Benchmarks + +This directory stores the standalone benchmarks and compare tools used while +bringing up and tuning the RVV path in MLAS. + +Files: + +- `sgemm_riscv_bench.cpp`: standalone SGEMM timing harness with checksum + output. Useful for RVV versus scalar comparisons. +- `softmax_rvv_compare.cpp`: scalar versus RVV validation and timing tool for + the Softmax critical path. + +These tools are intentionally kept separate from `onnxruntime_mlas_benchmark`. +Each source file has its own `main()` and is built as an independent target. + +## Build + +On a riscv64 RVV build, first regenerate the build tree: + +```bash +python3 tools/ci_build/build.py \ + --config Release \ + --build_dir build/k1_rvv_resync \ + --update \ + --skip_tests \ + --skip_pip_install \ + --skip_submodule_sync \ + --no_sve \ + --enable_rvv +``` + +Then build both standalone tools directly with CMake: + +```bash +cmake --build build/k1_rvv_resync/Release \ + --config Release \ + --target onnxruntime_mlas_sgemm_riscv_bench onnxruntime_mlas_softmax_riscv_compare \ + -- -j8 +``` + +The resulting binaries are typically placed under: + +```bash +build/k1_rvv_resync/Release/onnxruntime_mlas_sgemm_riscv_bench +build/k1_rvv_resync/Release/onnxruntime_mlas_softmax_riscv_compare +``` + +## SGEMM examples + +RVV, packed-B: + +```bash +taskset -c 0 build/k1_rvv_resync/Release/onnxruntime_mlas_sgemm_riscv_bench \ + --m=128 --n=3072 --k=768 --iters=10 --warmup=3 --pack_b=1 --trans_a=0 --trans_b=0 +``` + +Scalar baseline on the same binary: + +```bash +ORT_MLAS_RISCV_FORCE_SCALAR=1 taskset -c 0 \ + build/k1_rvv_resync/Release/onnxruntime_mlas_sgemm_riscv_bench \ + --m=128 --n=3072 --k=768 --iters=10 --warmup=3 --pack_b=1 --trans_a=0 --trans_b=0 +``` + +## Softmax examples + +```bash +taskset -c 0 build/k1_rvv_resync/Release/onnxruntime_mlas_softmax_riscv_compare +``` + +## Notes + +- The RVV SGEMM path is written to be VLEN-agnostic. The MLAS packing format + remains 16 columns wide, but each tile is consumed using runtime `vsetvl` + chunking so the same binary works across different VLENs such as 128 and 256. +- `ORT_MLAS_RISCV_FORCE_SCALAR=1` disables the RVV dispatch at runtime and is + the preferred way to gather scalar baselines from the same build. 
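+
+## Comparing RVV against the scalar baseline
+
+The snippet below is a small helper sketch, not part of the build, that runs the
+SGEMM bench twice: once with the RVV dispatch and once with
+`ORT_MLAS_RISCV_FORCE_SCALAR=1`, then compares the `gflops=` and `checksum=`
+lines printed by the tool. The binary path, shape flags, and the 1e-4 checksum
+tolerance are placeholders; adjust them to your build tree and workload.
+Checksums are compared with a tolerance because RVV changes the floating-point
+accumulation order.
+
+```python
+#!/usr/bin/env python3
+"""Sketch: run sgemm_riscv_bench with and without RVV and compare the results."""
+import os
+import re
+import subprocess
+import sys
+
+# Placeholder path and flags; adjust to your build directory and problem size.
+BENCH = "build/k1_rvv_resync/Release/onnxruntime_mlas_sgemm_riscv_bench"
+ARGS = ["--m=128", "--n=3072", "--k=768", "--iters=10", "--warmup=3", "--pack_b=1"]
+
+
+def run(force_scalar: bool) -> dict:
+    env = os.environ.copy()
+    if force_scalar:
+        env["ORT_MLAS_RISCV_FORCE_SCALAR"] = "1"
+    out = subprocess.run([BENCH, *ARGS], env=env, capture_output=True, text=True, check=True).stdout
+    # Pull the "gflops=..." and "checksum=..." values out of the bench output.
+    return {k: float(v) for k, v in re.findall(r"(gflops|checksum)=([-+0-9.eE]+)", out)}
+
+
+def main() -> int:
+    rvv = run(force_scalar=False)
+    scalar = run(force_scalar=True)
+    speedup = rvv["gflops"] / scalar["gflops"]
+    # RVV reorders FP accumulation, so allow a small relative difference in the checksum.
+    rel_diff = abs(rvv["checksum"] - scalar["checksum"]) / max(abs(scalar["checksum"]), 1e-12)
+    print(f"scalar={scalar['gflops']:.3f} GFLOPS  rvv={rvv['gflops']:.3f} GFLOPS  speedup={speedup:.2f}x")
+    print(f"checksum rel diff = {rel_diff:.3e}")
+    return 0 if rel_diff < 1e-4 else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
+```
+
+Run it from the repository root on the target board after building the bench
+target described above.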
diff --git a/onnxruntime/test/mlas/bench/riscv64/sgemm_riscv_bench.cpp b/onnxruntime/test/mlas/bench/riscv64/sgemm_riscv_bench.cpp new file mode 100644 index 0000000000000..d94840ffec518 --- /dev/null +++ b/onnxruntime/test/mlas/bench/riscv64/sgemm_riscv_bench.cpp @@ -0,0 +1,240 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + sgemm_riscv_bench.cpp + +Abstract: + + This module implements a standalone SGEMM benchmark used while tuning the + RISC-V MLAS path. It is intentionally separate from the Google Benchmark + suite so it can print pack time, compute time, checksum, and compare RVV + against scalar execution via ORT_MLAS_RISCV_FORCE_SCALAR. + +--*/ + +#include "mlas.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +struct Options { + size_t m = 128; + size_t n = 3072; + size_t k = 768; + size_t iters = 20; + size_t warmup = 3; + bool pack_b = false; + bool trans_a = false; + bool trans_b = false; + float alpha = 1.0f; + float beta = 0.0f; +}; + +void PrintUsage(const char* argv0) { + std::cout + << "Usage: " << argv0 << " [--m=N] [--n=N] [--k=N] [--iters=N] [--warmup=N]\n" + << " [--pack_b=0|1] [--trans_a=0|1] [--trans_b=0|1]\n" + << " [--alpha=F] [--beta=F]\n"; +} + +bool ParseBool(std::string_view value) { + return value == "1" || value == "true" || value == "on" || value == "yes"; +} + +float MakeValue(size_t index) { + uint32_t x = static_cast(index * 747796405u + 2891336453u); + x ^= x >> 16; + x *= 2246822519u; + x ^= x >> 13; + const uint32_t bucket = x % 2048u; + return (static_cast(bucket) / 1024.0f) - 1.0f; +} + +Options ParseArgs(int argc, char** argv) { + Options options; + + for (int i = 1; i < argc; ++i) { + std::string_view arg(argv[i]); + if (arg == "--help" || arg == "-h") { + PrintUsage(argv[0]); + std::exit(0); + } + + const auto split = arg.find('='); + if (split == std::string_view::npos || split == 0 || split + 1 >= arg.size()) { + continue; + } + + const std::string_view key = arg.substr(0, split); + const std::string_view value = arg.substr(split + 1); + + if (key == "--m") { + options.m = std::strtoull(value.data(), nullptr, 10); + } else if (key == "--n") { + options.n = std::strtoull(value.data(), nullptr, 10); + } else if (key == "--k") { + options.k = std::strtoull(value.data(), nullptr, 10); + } else if (key == "--iters") { + options.iters = std::strtoull(value.data(), nullptr, 10); + } else if (key == "--warmup") { + options.warmup = std::strtoull(value.data(), nullptr, 10); + } else if (key == "--pack_b") { + options.pack_b = ParseBool(value); + } else if (key == "--trans_a") { + options.trans_a = ParseBool(value); + } else if (key == "--trans_b") { + options.trans_b = ParseBool(value); + } else if (key == "--alpha") { + options.alpha = std::strtof(value.data(), nullptr); + } else if (key == "--beta") { + options.beta = std::strtof(value.data(), nullptr); + } + } + + return options; +} + +template +double TimeLoop(size_t iterations, Fn&& fn) { + const auto begin = std::chrono::steady_clock::now(); + for (size_t i = 0; i < iterations; ++i) { + fn(); + } + const auto end = std::chrono::steady_clock::now(); + return std::chrono::duration(end - begin).count(); +} + +} // namespace + +int main(int argc, char** argv) { + const Options options = ParseArgs(argc, argv); + + if (options.m == 0 || options.n == 0 || options.k == 0 || options.iters == 0) { + std::cerr << "m, n, k, and iters must be > 0" << std::endl; + return 1; + } + + 
const size_t a_size = options.m * options.k; + const size_t b_size = options.n * options.k; + const size_t c_size = options.m * options.n; + + std::vector a(a_size); + std::vector b(b_size); + std::vector c(c_size, 0.0f); + + for (size_t i = 0; i < a.size(); ++i) { + a[i] = MakeValue(i); + } + for (size_t i = 0; i < b.size(); ++i) { + b[i] = MakeValue(i + a.size()); + } + + const CBLAS_TRANSPOSE trans_a = options.trans_a ? CblasTrans : CblasNoTrans; + const CBLAS_TRANSPOSE trans_b = options.trans_b ? CblasTrans : CblasNoTrans; + const size_t lda = options.trans_a ? options.m : options.k; + const size_t ldb = options.trans_b ? options.k : options.n; + const size_t ldc = options.n; + + std::vector packed_b; + double pack_ms = 0.0; + + if (options.pack_b) { + const size_t packed_b_size = MlasGemmPackBSize(trans_a, trans_b, options.n, options.k, nullptr); + if (packed_b_size == 0) { + std::cerr << "packing is not supported for this configuration" << std::endl; + return 2; + } + + packed_b.resize(packed_b_size); + + pack_ms = TimeLoop(options.iters, [&]() { + MlasGemmPackB(trans_a, trans_b, options.n, options.k, b.data(), ldb, packed_b.data(), nullptr); + }); + + MlasGemmPackB(trans_a, trans_b, options.n, options.k, b.data(), ldb, packed_b.data(), nullptr); + } + + auto run_once = [&]() { + if (options.beta == 0.0f) { + std::fill(c.begin(), c.end(), 0.0f); + } + + if (options.pack_b) { + MlasGemm( + trans_a, + options.m, + options.n, + options.k, + options.alpha, + a.data(), + lda, + packed_b.data(), + options.beta, + c.data(), + ldc, + nullptr, + nullptr); + } else { + MlasGemm( + trans_a, + trans_b, + options.m, + options.n, + options.k, + options.alpha, + a.data(), + lda, + b.data(), + ldb, + options.beta, + c.data(), + ldc, + nullptr, + nullptr); + } + }; + + for (size_t i = 0; i < options.warmup; ++i) { + run_once(); + } + + const double compute_ms = TimeLoop(options.iters, run_once); + const double avg_compute_ms = compute_ms / static_cast(options.iters); + const double avg_pack_ms = pack_ms / static_cast(options.iters); + const double flops = 2.0 * static_cast(options.m) * static_cast(options.n) * + static_cast(options.k); + const double gflops = flops / (avg_compute_ms * 1.0e6); + const double checksum = std::accumulate(c.begin(), c.end(), 0.0); + + std::cout << std::fixed << std::setprecision(4); + std::cout << "M=" << options.m + << " N=" << options.n + << " K=" << options.k + << " pack_b=" << (options.pack_b ? 1 : 0) + << " trans_a=" << (options.trans_a ? 1 : 0) + << " trans_b=" << (options.trans_b ? 1 : 0) + << " iters=" << options.iters + << " warmup=" << options.warmup << '\n'; + if (options.pack_b) { + std::cout << "pack_total_ms=" << pack_ms << " pack_avg_ms=" << avg_pack_ms << '\n'; + } + std::cout << "compute_total_ms=" << compute_ms + << " compute_avg_ms=" << avg_compute_ms + << " gflops=" << gflops << '\n'; + std::cout << "checksum=" << checksum << std::endl; + + return 0; +} diff --git a/onnxruntime/test/mlas/bench/riscv64/softmax_rvv_compare.cpp b/onnxruntime/test/mlas/bench/riscv64/softmax_rvv_compare.cpp new file mode 100644 index 0000000000000..e4411d3920408 --- /dev/null +++ b/onnxruntime/test/mlas/bench/riscv64/softmax_rvv_compare.cpp @@ -0,0 +1,241 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + softmax_rvv_compare.cpp + +Abstract: + + This module implements a standalone RVV versus scalar validation and + timing tool for the Softmax critical path on riscv64. 
+ +--*/ + +#include "mlas.h" + +#include + +#if !defined(MLAS_TARGET_RISCV64) + +int main() { + std::cout << "softmax_rvv_compare is only supported on riscv64." << std::endl; + return 0; +} + +#elif !defined(MLAS_USE_RVV) + +int main() { + std::cout << "softmax_rvv_compare requires an RVV-enabled MLAS build." << std::endl; + return 0; +} + +#else + +#include "core/mlas/lib/mlasi.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +struct CompareStats { + float max_abs_diff = 0.0f; + float max_rel_diff = 0.0f; + double checksum_scalar = 0.0; + double checksum_rvv = 0.0; +}; + +struct TimingStats { + double scalar_ms = 0.0; + double rvv_ms = 0.0; +}; + +void ScalarSoftmaxRow(const float* input, float* output, size_t d, bool log_softmax, bool smooth_softmax) { + float maximum = MlasReduceMaximumF32Kernel(input, d); + if (smooth_softmax && maximum < 0.0f) { + maximum = 0.0f; + } + + const float negative_maximum = -maximum; + + if (log_softmax) { + float accumulation = MlasComputeSumExpF32Kernel(input, nullptr, d, &negative_maximum); + if (smooth_softmax) { + accumulation += std::exp(-maximum); + } + + const float parameters[2] = {negative_maximum, std::log(accumulation)}; + MlasComputeLogSoftmaxOutputF32Kernel(input, output, d, parameters); + return; + } + + float accumulation = MlasComputeSumExpF32Kernel(input, output, d, &negative_maximum); + if (smooth_softmax) { + accumulation += std::exp(-maximum); + } + + const float parameters[1] = {1.0f / accumulation}; + MlasComputeSoftmaxOutputF32Kernel(output, d, parameters); +} + +void RvvSoftmaxRow(const float* input, float* output, size_t d, bool log_softmax, bool smooth_softmax) { + auto& platform = GetMlasPlatform(); + + float maximum = platform.ReduceMaximumF32Kernel(input, d); + if (smooth_softmax && maximum < 0.0f) { + maximum = 0.0f; + } + + const float negative_maximum = -maximum; + + if (log_softmax) { + float accumulation = platform.ComputeSumExpF32Kernel(input, nullptr, d, &negative_maximum); + if (smooth_softmax) { + accumulation += std::exp(-maximum); + } + + const float parameters[2] = {negative_maximum, std::log(accumulation)}; + platform.ComputeLogSoftmaxOutputF32Kernel(input, output, d, parameters); + return; + } + + float accumulation = platform.ComputeSumExpF32Kernel(input, output, d, &negative_maximum); + if (smooth_softmax) { + accumulation += std::exp(-maximum); + } + + const float parameters[1] = {1.0f / accumulation}; + platform.ComputeSoftmaxOutputF32Kernel(output, d, parameters); +} + +CompareStats CompareCase(size_t rows, size_t d, bool log_softmax, bool smooth_softmax) { + std::vector input(rows * d); + std::vector scalar_output(rows * d); + std::vector rvv_output(rows * d); + + std::mt19937 rng( + static_cast(rows * 131 + d * 17 + (log_softmax ? 7 : 0) + (smooth_softmax ? 
19 : 0))); + std::uniform_real_distribution dist(-150.0f, 190.0f); + + for (float& value : input) { + value = dist(rng); + } + + for (size_t row = 0; row < rows; ++row) { + const float* row_input = input.data() + row * d; + ScalarSoftmaxRow(row_input, scalar_output.data() + row * d, d, log_softmax, smooth_softmax); + RvvSoftmaxRow(row_input, rvv_output.data() + row * d, d, log_softmax, smooth_softmax); + } + + CompareStats stats; + for (size_t i = 0; i < rows * d; ++i) { + const float scalar = scalar_output[i]; + const float rvv = rvv_output[i]; + const float abs_diff = std::fabs(scalar - rvv); + const float rel_diff = abs_diff / std::max(std::fabs(scalar), 1.0e-12f); + stats.max_abs_diff = std::max(stats.max_abs_diff, abs_diff); + stats.max_rel_diff = std::max(stats.max_rel_diff, rel_diff); + stats.checksum_scalar += scalar; + stats.checksum_rvv += rvv; + } + + return stats; +} + +TimingStats TimeCase(size_t rows, size_t d, size_t repeats, bool log_softmax, bool smooth_softmax) { + std::vector input(rows * d); + std::vector scalar_output(rows * d); + std::vector rvv_output(rows * d); + + std::mt19937 rng(static_cast(rows * 97 + d * 29 + repeats)); + std::uniform_real_distribution dist(-10.0f, 10.0f); + + for (float& value : input) { + value = dist(rng); + } + + const auto scalar_begin = std::chrono::steady_clock::now(); + for (size_t repeat = 0; repeat < repeats; ++repeat) { + for (size_t row = 0; row < rows; ++row) { + ScalarSoftmaxRow(input.data() + row * d, scalar_output.data() + row * d, d, log_softmax, smooth_softmax); + } + } + const auto scalar_end = std::chrono::steady_clock::now(); + + const auto rvv_begin = std::chrono::steady_clock::now(); + for (size_t repeat = 0; repeat < repeats; ++repeat) { + for (size_t row = 0; row < rows; ++row) { + RvvSoftmaxRow(input.data() + row * d, rvv_output.data() + row * d, d, log_softmax, smooth_softmax); + } + } + const auto rvv_end = std::chrono::steady_clock::now(); + + TimingStats stats; + stats.scalar_ms = + std::chrono::duration_cast >(scalar_end - scalar_begin).count(); + stats.rvv_ms = + std::chrono::duration_cast >(rvv_end - rvv_begin).count(); + return stats; +} + +void PrintCompareCase(const std::string& name, size_t rows, size_t d, bool log_softmax, bool smooth_softmax) { + const auto stats = CompareCase(rows, d, log_softmax, smooth_softmax); + std::cout << name << " rows=" << rows << " d=" << d << " log_softmax=" << log_softmax + << " smooth=" << smooth_softmax << '\n'; + std::cout << " max_abs_diff=" << std::setprecision(9) << stats.max_abs_diff + << " max_rel_diff=" << stats.max_rel_diff << '\n'; + std::cout << " checksum_scalar=" << std::setprecision(12) << stats.checksum_scalar + << " checksum_rvv=" << stats.checksum_rvv << '\n'; +} + +void PrintTimingCase( + const std::string& name, size_t rows, size_t d, size_t repeats, bool log_softmax, bool smooth_softmax) { + const auto stats = TimeCase(rows, d, repeats, log_softmax, smooth_softmax); + const double speedup = stats.rvv_ms > 0.0 ? 
stats.scalar_ms / stats.rvv_ms : 0.0; + std::cout << name << " rows=" << rows << " d=" << d << " repeats=" << repeats + << " log_softmax=" << log_softmax << " smooth=" << smooth_softmax << '\n'; + std::cout << " scalar_ms=" << std::fixed << std::setprecision(3) << stats.scalar_ms + << " rvv_ms=" << stats.rvv_ms << " speedup=" << speedup << "x\n"; +} + +} // namespace + +int main() { + auto& platform = GetMlasPlatform(); + + std::cout << std::boolalpha; + std::cout << "dispatch_is_rvv_reduce=" + << (platform.ReduceMaximumF32Kernel == MlasReduceMaximumF32KernelRvv) << '\n'; + std::cout << "dispatch_is_rvv_sumexp=" + << (platform.ComputeSumExpF32Kernel == MlasComputeSumExpF32KernelRvv) << '\n'; + std::cout << "dispatch_is_rvv_softmax=" + << (platform.ComputeSoftmaxOutputF32Kernel == MlasComputeSoftmaxOutputF32KernelRvv) << '\n'; + std::cout << "dispatch_is_rvv_logsoftmax=" + << (platform.ComputeLogSoftmaxOutputF32Kernel == MlasComputeLogSoftmaxOutputF32KernelRvv) << '\n'; + std::cout << '\n'; + + PrintCompareCase("regression_case_3x128_softmax", 3, 128, false, true); + PrintCompareCase("regression_case_3x128_logsoftmax", 3, 128, true, true); + PrintCompareCase("regression_case_63x95_softmax", 63, 95, false, true); + PrintCompareCase("regression_case_16x211_softmax", 16, 211, false, true); + std::cout << '\n'; + + PrintTimingCase("perf_case_attention_like", 4096, 128, 100, false, true); + PrintTimingCase("perf_case_long_seq", 1024, 1024, 20, false, true); + + return 0; +} + +#endif diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index b60615e0c967f..f56c81d2e89de 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -946,6 +946,224 @@ TEST(CoreMLExecutionProviderTest, HardSigmoidTest) { TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); #endif } + +TEST(CoreMLExecutionProviderTest, QuickGeluTest) { + // Single com.microsoft:QuickGelu node (produced by ORT's QuickGeluFusion pass + // from the pattern x * sigmoid(alpha * x)). Verify the CoreML MLProgram path + // claims the whole graph and produces the same output as CPU. 
+ ONNX_NAMESPACE::ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::IR_VERSION); + auto* onnx_opset = model_proto.add_opset_import(); + onnx_opset->set_domain(""); + onnx_opset->set_version(13); + auto* ms_opset = model_proto.add_opset_import(); + ms_opset->set_domain("com.microsoft"); + ms_opset->set_version(1); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("quick_gelu_test"); + + auto* input = graph_proto->add_input(); + input->set_name("X"); + auto* input_shape = input->mutable_type()->mutable_tensor_type()->mutable_shape(); + input->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_shape->add_dim()->set_dim_value(1); + input_shape->add_dim()->set_dim_value(3); + input_shape->add_dim()->set_dim_value(2); + input_shape->add_dim()->set_dim_value(4); + + auto* output = graph_proto->add_output(); + output->set_name("Y"); + auto* output_shape = output->mutable_type()->mutable_tensor_type()->mutable_shape(); + output->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + output_shape->add_dim()->set_dim_value(1); + output_shape->add_dim()->set_dim_value(3); + output_shape->add_dim()->set_dim_value(2); + output_shape->add_dim()->set_dim_value(4); + + auto* node = graph_proto->add_node(); + node->set_op_type("QuickGelu"); + node->set_domain("com.microsoft"); + node->add_input("X"); + node->add_output("Y"); + // Use a non-default alpha so the test catches any attribute-wiring bug. + auto* alpha_attr = node->add_attribute(); + alpha_attr->set_name("alpha"); + alpha_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_FLOAT); + alpha_attr->set_f(1.5f); + + std::string model_data; + ASSERT_TRUE(model_proto.SerializeToString(&model_data)); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + +#if defined(__APPLE__) + std::vector dims = {1, 3, 2, 4}; + std::vector input_data = {-10.0f, -3.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 3.0f, + 10.0f, -5.0f, 5.0f, 2.0f, -2.0f, 4.0f, -4.0f, 0.25f, + -0.25f, 7.0f, -7.0f, 1.5f, -1.5f, 0.1f, -0.1f, 20.0f}; + OrtValue ml_value_x; + AllocatorPtr allocator = CPUAllocator::DefaultInstance(); + CreateMLValue(allocator, dims, input_data, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + + RunAndVerifyOutputsWithEP(model_span, "QuickGeluTest_MLProgram", + MakeCoreMLExecutionProvider("MLProgram"), + feeds, + EPVerificationParams{ExpectedEPNodeAssignment::All}); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +#endif +} + +TEST(CoreMLExecutionProviderTest, QuickGeluTestAlphaOne) { + // alpha == 1.0 triggers the "skip leading mul" optimization in the op + // builder. Verify correctness on that branch — the emitted MIL graph is + // sigmoid(x) -> mul(x, sigmoid(x)) instead of the 3-op decomposition. 
+ ONNX_NAMESPACE::ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::IR_VERSION); + auto* onnx_opset = model_proto.add_opset_import(); + onnx_opset->set_domain(""); + onnx_opset->set_version(13); + auto* ms_opset = model_proto.add_opset_import(); + ms_opset->set_domain("com.microsoft"); + ms_opset->set_version(1); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("quick_gelu_alpha_one_test"); + + auto* input = graph_proto->add_input(); + input->set_name("X"); + auto* input_shape = input->mutable_type()->mutable_tensor_type()->mutable_shape(); + input->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_shape->add_dim()->set_dim_value(1); + input_shape->add_dim()->set_dim_value(3); + input_shape->add_dim()->set_dim_value(2); + input_shape->add_dim()->set_dim_value(4); + + auto* output = graph_proto->add_output(); + output->set_name("Y"); + auto* output_shape = output->mutable_type()->mutable_tensor_type()->mutable_shape(); + output->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + output_shape->add_dim()->set_dim_value(1); + output_shape->add_dim()->set_dim_value(3); + output_shape->add_dim()->set_dim_value(2); + output_shape->add_dim()->set_dim_value(4); + + auto* node = graph_proto->add_node(); + node->set_op_type("QuickGelu"); + node->set_domain("com.microsoft"); + node->add_input("X"); + node->add_output("Y"); + auto* alpha_attr = node->add_attribute(); + alpha_attr->set_name("alpha"); + alpha_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_FLOAT); + alpha_attr->set_f(1.0f); + + std::string model_data; + ASSERT_TRUE(model_proto.SerializeToString(&model_data)); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + +#if defined(__APPLE__) + std::vector dims = {1, 3, 2, 4}; + std::vector input_data = {-10.0f, -3.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 3.0f, + 10.0f, -5.0f, 5.0f, 2.0f, -2.0f, 4.0f, -4.0f, 0.25f, + -0.25f, 7.0f, -7.0f, 1.5f, -1.5f, 0.1f, -0.1f, 20.0f}; + OrtValue ml_value_x; + AllocatorPtr allocator = CPUAllocator::DefaultInstance(); + CreateMLValue(allocator, dims, input_data, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + + RunAndVerifyOutputsWithEP(model_span, "QuickGeluTestAlphaOne_MLProgram", + MakeCoreMLExecutionProvider("MLProgram"), + feeds, + EPVerificationParams{ExpectedEPNodeAssignment::All}); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +#endif +} + +TEST(CoreMLExecutionProviderTest, QuickGeluTestFp16) { + // FLOAT16 variant of QuickGeluTest. Exercises the MLFloat16 branch of the + // alpha-scalar wiring in QuickGeluOpBuilder::AddToModelBuilderImpl. 
+ ONNX_NAMESPACE::ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::IR_VERSION); + auto* onnx_opset = model_proto.add_opset_import(); + onnx_opset->set_domain(""); + onnx_opset->set_version(13); + auto* ms_opset = model_proto.add_opset_import(); + ms_opset->set_domain("com.microsoft"); + ms_opset->set_version(1); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("quick_gelu_fp16_test"); + + auto* input = graph_proto->add_input(); + input->set_name("X"); + auto* input_shape = input->mutable_type()->mutable_tensor_type()->mutable_shape(); + input->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); + input_shape->add_dim()->set_dim_value(1); + input_shape->add_dim()->set_dim_value(3); + input_shape->add_dim()->set_dim_value(2); + input_shape->add_dim()->set_dim_value(4); + + auto* output = graph_proto->add_output(); + output->set_name("Y"); + auto* output_shape = output->mutable_type()->mutable_tensor_type()->mutable_shape(); + output->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); + output_shape->add_dim()->set_dim_value(1); + output_shape->add_dim()->set_dim_value(3); + output_shape->add_dim()->set_dim_value(2); + output_shape->add_dim()->set_dim_value(4); + + auto* node = graph_proto->add_node(); + node->set_op_type("QuickGelu"); + node->set_domain("com.microsoft"); + node->add_input("X"); + node->add_output("Y"); + auto* alpha_attr = node->add_attribute(); + alpha_attr->set_name("alpha"); + alpha_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_FLOAT); + alpha_attr->set_f(1.5f); + + std::string model_data; + ASSERT_TRUE(model_proto.SerializeToString(&model_data)); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + +#if defined(__APPLE__) + std::vector dims = {1, 3, 2, 4}; + const std::vector input_floats = {-10.0f, -3.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 3.0f, + 10.0f, -5.0f, 5.0f, 2.0f, -2.0f, 4.0f, -4.0f, 0.25f, + -0.25f, 7.0f, -7.0f, 1.5f, -1.5f, 0.1f, -0.1f, 20.0f}; + std::vector input_data; + input_data.reserve(input_floats.size()); + for (float f : input_floats) input_data.emplace_back(f); + + OrtValue ml_value_x; + AllocatorPtr allocator = CPUAllocator::DefaultInstance(); + CreateMLValue(allocator, dims, input_data, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + + EPVerificationParams params{}; + params.ep_node_assignment = ExpectedEPNodeAssignment::All; + // fp16 accumulates larger absolute error than fp32 across the three-op + // decomposition (mul, sigmoid, mul). Outputs are bounded by |x|, max ~20 in + // this test; fp16 ulp at that magnitude is ~0.01, so 2e-2 leaves headroom. 
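+ // (Concretely, the fp16 spacing for values in [16, 32) is 2^-6 ≈ 0.0156.)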
+ params.fp32_abs_err = 2e-2f; + + RunAndVerifyOutputsWithEP(model_span, "QuickGeluTestFp16_MLProgram", + MakeCoreMLExecutionProvider("MLProgram"), + feeds, params); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +#endif +} + #endif // !(ORT_MINIMAL_BUILD) } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc b/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc index 6a3b0d8160d53..2f51b8a7a5690 100644 --- a/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc +++ b/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc @@ -1208,10 +1208,11 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_CUDA_Passthrough) { } test.AddInput("input", {batch_size, sequence_length, hidden_size}, input_data); + // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output. test.AddInput("cos_cache", {max_sequence_length, head_size / 2}, - std::vector(max_sequence_length * head_size / 2, 1.0f)); + std::vector(max_sequence_length * head_size / 2, 0.5f)); test.AddInput("sin_cache", {max_sequence_length, head_size / 2}, - std::vector(max_sequence_length * head_size / 2, 0.0f)); + std::vector(max_sequence_length * head_size / 2, 0.866f)); // position_id = 2048 exceeds max_sequence_length = 8 — CUDA should pass through input unchanged. test.AddInput("position_ids", {batch_size, sequence_length}, {2048}); @@ -1291,5 +1292,125 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_RejectsRank3HiddenSizeNotDivisibleByNu "hidden_size=5 must be divisible by num_heads=2 for rank-3 input", {}, nullptr, &execution_providers); } +// Test that OOB position_ids on WebGPU pass through input unchanged (shader-side defense). +TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_WebGPU_Passthrough) { + if (nullptr == DefaultWebGpuExecutionProvider().get()) { + GTEST_SKIP() << "WebGPU execution provider is not available."; + } + + int batch_size = 1; + int sequence_length = 1; + int num_heads = 2; + int head_size = 4; + int max_sequence_length = 8; + int hidden_size = num_heads * head_size; + + OpTester test("RotaryEmbedding", 23, onnxruntime::kOnnxDomain); + test.AddAttribute("interleaved", static_cast(0)); + test.AddAttribute("num_heads", static_cast(num_heads)); + + std::vector input_data(hidden_size); + for (int i = 0; i < hidden_size; ++i) { + input_data[i] = static_cast(i + 1); + } + + test.AddInput("input", {batch_size, sequence_length, hidden_size}, input_data); + // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output. + test.AddInput("cos_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.5f)); + test.AddInput("sin_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.866f)); + // position_id = 2048 exceeds max_sequence_length = 8 — shader passes through input unchanged. + test.AddInput("position_ids", {batch_size, sequence_length}, {2048}); + + // Output should equal input when position_id is OOB (pass-through). 
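+ // SetOutputAbsErr(0.0f) below requires an exact match, so any rotation applied despite the OOB id would fail the test.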
+ test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
+ test.SetOutputAbsErr("output", 0.0f);
+
+ std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+ execution_providers.push_back(DefaultWebGpuExecutionProvider());
+ test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+
+// Test that negative position_ids pass through on WebGPU (shader-side defense catches raw_pos < 0).
+TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_Negative_WebGPU_Passthrough) {
+ if (nullptr == DefaultWebGpuExecutionProvider().get()) {
+ GTEST_SKIP() << "WebGPU execution provider is not available.";
+ }
+
+ int batch_size = 1;
+ int sequence_length = 1;
+ int num_heads = 2;
+ int head_size = 4;
+ int max_sequence_length = 8;
+ int hidden_size = num_heads * head_size;
+
+ OpTester test("RotaryEmbedding", 23, onnxruntime::kOnnxDomain);
+ test.AddAttribute("interleaved", static_cast<int64_t>(0));
+ test.AddAttribute("num_heads", static_cast<int64_t>(num_heads));
+
+ std::vector<float> input_data(hidden_size);
+ for (int i = 0; i < hidden_size; ++i) {
+ input_data[i] = static_cast<float>(i + 1);
+ }
+
+ test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
+ // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output.
+ test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
+ std::vector<float>(max_sequence_length * head_size / 2, 0.5f));
+ test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
+ std::vector<float>(max_sequence_length * head_size / 2, 0.866f));
+ // Negative position_id — shader checks raw_pos < 0 and passes through.
+ test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {-1});
+
+ // Output should equal input when position_id is negative (pass-through).
+ test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
+ test.SetOutputAbsErr("output", 0.0f);
+
+ std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+ execution_providers.push_back(DefaultWebGpuExecutionProvider());
+ test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+
+// Test that OOB position_ids in a batch pass through on WebGPU (shader-side defense).
+TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_InBatch_WebGPU_Passthrough) {
+ if (nullptr == DefaultWebGpuExecutionProvider().get()) {
+ GTEST_SKIP() << "WebGPU execution provider is not available.";
+ }
+
+ int batch_size = 2;
+ int sequence_length = 2;
+ int num_heads = 2;
+ int head_size = 4;
+ int max_sequence_length = 8;
+ int hidden_size = num_heads * head_size;
+
+ OpTester test("RotaryEmbedding", 23, onnxruntime::kOnnxDomain);
+ test.AddAttribute("interleaved", static_cast<int64_t>(0));
+ test.AddAttribute("num_heads", static_cast<int64_t>(num_heads));
+
+ std::vector<float> input_data(batch_size * sequence_length * hidden_size);
+ for (size_t i = 0; i < input_data.size(); ++i) {
+ input_data[i] = static_cast<float>(i + 1);
+ }
+
+ test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
+ // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output.
+ test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
+ std::vector<float>(max_sequence_length * head_size / 2, 0.5f));
+ test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
+ std::vector<float>(max_sequence_length * head_size / 2, 0.866f));
+ // All OOB position_ids — shader passes through input unchanged.
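+ // (max_sequence_length is 8, so every id below is out of range.)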
+ test.AddInput("position_ids", {batch_size, sequence_length}, {100, 200, 300, 400}); + + // Output should equal input when all position_ids are OOB (pass-through). + test.AddOutput("output", {batch_size, sequence_length, hidden_size}, input_data); + test.SetOutputAbsErr("output", 0.0f); + + std::vector> execution_providers; + execution_providers.push_back(DefaultWebGpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/math/matmul_test.cc b/onnxruntime/test/providers/cpu/math/matmul_test.cc index 45b961ee21849..f624ecf57d05e 100644 --- a/onnxruntime/test/providers/cpu/math/matmul_test.cc +++ b/onnxruntime/test/providers/cpu/math/matmul_test.cc @@ -181,6 +181,38 @@ std::vector> GenerateTestCases() { // clang-format on })}); + test_cases.push_back( + {"test 3D tensors with batchA = 3, M = 2, N = 3", + {3, 2, 8}, + {1, 8, 3}, + {3, 2, 3}, + real_expected_vals({ + // clang-format off + 420, 448, 476, + 1092, 1184, 1276, + 1764, 1920, 2076, + 2436, 2656, 2876, + 3108, 3392, 3676, + 3780, 4128, 4476, + // clang-format on + })}); + + test_cases.push_back( + {"test 3D tensors with batchA = 3, M = 2, N = 4", + {3, 2, 8}, + {1, 8, 4}, + {3, 2, 4}, + real_expected_vals({ + // clang-format off + 560, 588, 616, 644, + 1456, 1548, 1640, 1732, + 2352, 2508, 2664, 2820, + 3248, 3468, 3688, 3908, + 4144, 4428, 4712, 4996, + 5040, 5388, 5736, 6084, + // clang-format on + })}); + test_cases.push_back( {"test 4D tensors with M = 1", {2, 3, 1, 8}, @@ -598,6 +630,63 @@ TEST(MathOpTest, MatMulSharedPrepackedWeights) { } } +// Test MatMul with batch_size > 1 that exercises the Split-K path. +// Split-K is triggered when dim_inner is large relative to dim_a_outer * dim_b_outer, +// is_vec4 is true, and the GPU supports it. This test validates correctness when +// batch_size > 1 with dimensions that would trigger Split-K on supported hardware. +TEST(MathOpTest, MatMulBatchedSplitK) { + // Dimensions chosen so dim_inner is large (triggers Split-K) and vec4-compatible. + // batch=2, M=4, K=768, N=64 + constexpr int64_t batch = 2; + constexpr int64_t M = 4; + constexpr int64_t K = 768; + constexpr int64_t N = 64; + + std::vector A_shape = {batch, M, K}; + std::vector B_shape = {batch, K, N}; + std::vector Y_shape = {batch, M, N}; + + // Generate sequential data so the expected output is deterministic. + int64_t a_size = batch * M * K; + int64_t b_size = batch * K * N; + std::vector A_data(a_size); + std::vector B_data(b_size); + + // Use small values to avoid fp32 overflow. + for (int64_t i = 0; i < a_size; ++i) { + A_data[i] = static_cast((i % 11) - 5) * 0.01f; + } + for (int64_t i = 0; i < b_size; ++i) { + B_data[i] = static_cast((i % 13) - 6) * 0.01f; + } + + // Compute expected output on CPU. + std::vector expected(batch * M * N, 0.0f); + for (int64_t b_idx = 0; b_idx < batch; ++b_idx) { + for (int64_t m = 0; m < M; ++m) { + for (int64_t n = 0; n < N; ++n) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) { + float a_val = A_data[b_idx * M * K + m * K + k]; + float b_val = B_data[b_idx * K * N + k * N + n]; + sum += a_val * b_val; + } + expected[b_idx * M * N + m * N + n] = sum; + } + } + } + + OpTester test("MatMul", 13); + test.AddInput("A", A_shape, A_data); + test.AddInput("B", B_shape, B_data); + test.AddOutput("Y", Y_shape, expected); + + // Exclude providers that don't support this configuration. 
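+ // The remaining EPs must reproduce the CPU reference computed above within the default tolerance.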
+ test.ConfigExcludeEps({kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kQnnExecutionProvider}) + .Config(run_with_tunable_op) + .RunWithConfig(); +} + #endif } // namespace test diff --git a/onnxruntime/test/providers/cpu/ml/linearclassifer_test.cc b/onnxruntime/test/providers/cpu/ml/linearclassifer_test.cc index 6f80b6f1dfb7a..8083874213b1f 100644 --- a/onnxruntime/test/providers/cpu/ml/linearclassifer_test.cc +++ b/onnxruntime/test/providers/cpu/ml/linearclassifer_test.cc @@ -129,7 +129,7 @@ TEST(MLOpTest, LinearClassifierBinaryWithLabels) { TEST(MLOpTest, LinearClassifierInvalidCoefficientsSize) { OpTester test("LinearClassifier", 1, onnxruntime::kMLDomain); - test.AddAttribute("coefficients", std::vector{1.f, 2.f}); + test.AddAttribute("coefficients", std::vector{1.f, 2.f, 3.f}); test.AddAttribute("intercepts", std::vector{0.f, 0.f}); test.AddAttribute("classlabels_ints", std::vector{0, 1}); @@ -202,6 +202,26 @@ TEST(MLOpTest, LinearClassifierInvalidCoefficientsSizeFails) { "coefficients size (3) is less than class_count (3) * num_features (2)"); } +TEST(MLOpTest, LinearClassifierExtraCoefficientsAreIgnored) { + OpTester test("LinearClassifier", 1, onnxruntime::kMLDomain); + + std::vector coefficients = {-0.22562418f, 0.34188559f, 0.68346153f, + -0.68051993f, -0.1975279f, 0.03748541f, + 101.f, 102.f, 103.f}; + std::vector classes = {1, 2, 3}; + std::vector intercepts = {-3.91601811f, 0.42575697f, 0.13731251f}; + + test.AddAttribute("coefficients", coefficients); + test.AddAttribute("intercepts", intercepts); + test.AddAttribute("classlabels_ints", classes); + + test.AddInput("X", {1, 2}, {1.f, 0.f}); + test.AddOutput("Y", {1}, {2LL}); + test.AddOutput("Z", {1, 3}, {-4.14164229f, 1.1092185f, -0.06021539f}); + + test.Run(); +} + // Regression test: coefficients not divisible by class_count. 
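+// The expected failure message below reflects the updated size check wording ("is less than class_count * num_features").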
TEST(MLOpTest, LinearClassifierCoefficientsSizeNotDivisibleByClassCountFails) { OpTester test("LinearClassifier", 1, onnxruntime::kMLDomain); @@ -220,7 +240,27 @@ TEST(MLOpTest, LinearClassifierCoefficientsSizeNotDivisibleByClassCountFails) { test.AddOutput("Z", {1, 3}, {0.f, 0.f, 0.f}); test.Run(OpTester::ExpectResult::kExpectFailure, - "coefficients size (5) must be a multiple of the number of classes (3)"); + "coefficients size (5) is less than class_count (3) * num_features (2)"); +} + +TEST(MLOpTest, LinearClassifierInputFeatureCountMismatchFails) { + OpTester test("LinearClassifier", 1, onnxruntime::kMLDomain); + + std::vector coefficients = {-0.22562418f, 0.34188559f, 0.68346153f, + -0.68051993f, -0.1975279f, 0.03748541f}; + std::vector classes = {1, 2, 3}; + std::vector intercepts = {-3.91601811f, 0.42575697f, 0.13731251f}; + + test.AddAttribute("coefficients", coefficients); + test.AddAttribute("intercepts", intercepts); + test.AddAttribute("classlabels_ints", classes); + + test.AddInput("X", {1, 3}, {1.f, 0.f, 0.f}); + test.AddOutput("Y", {1}, {0LL}); + test.AddOutput("Z", {1, 3}, {0.f, 0.f, 0.f}); + + test.Run(OpTester::ExpectResult::kExpectFailure, + "coefficients size (6) is less than class_count (3) * num_features (3)"); } } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/ml/svmclassifier_test.cc b/onnxruntime/test/providers/cpu/ml/svmclassifier_test.cc index 3c5b71b90b4b8..2c89c03b6791b 100644 --- a/onnxruntime/test/providers/cpu/ml/svmclassifier_test.cc +++ b/onnxruntime/test/providers/cpu/ml/svmclassifier_test.cc @@ -297,6 +297,31 @@ TEST(MLOpTest, SVMClassifierUndersizedCoefficients) { test.Run(OpTester::ExpectResult::kExpectFailure, "coefficients attribute size"); } +TEST(MLOpTest, SVMClassifierVectorsPerClassSizeMismatch) { + OpTester test("SVMClassifier", 1, onnxruntime::kMLDomain); + + std::vector coefficients = {1.f, 1.f, 1.f, 1.f}; + std::vector support_vectors = {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f}; + std::vector rho = {0.1f, 0.1f, 0.1f}; + std::vector kernel_params = {0.01f, 0.f, 3.f}; + std::vector classes = {0, 1, 2}; + std::vector vectors_per_class = {1, 1}; // needs one entry per class + + test.AddAttribute("kernel_type", std::string("RBF")); + test.AddAttribute("coefficients", coefficients); + test.AddAttribute("support_vectors", support_vectors); + test.AddAttribute("vectors_per_class", vectors_per_class); + test.AddAttribute("rho", rho); + test.AddAttribute("kernel_params", kernel_params); + test.AddAttribute("classlabels_ints", classes); + + test.AddInput("X", {1, 4}, {0.f, 0.f, 0.f, 0.f}); + test.AddOutput("Y", {1}, {1}); + test.AddOutput("Z", {1, 3}, {0.f, 0.f, 0.f}); + + test.Run(OpTester::ExpectResult::kExpectFailure, "vectors_per_class"); +} + TEST(MLOpTest, SVMClassifierInvalidInputFeatureCount) { OpTester test("SVMClassifier", 1, onnxruntime::kMLDomain); @@ -438,5 +463,47 @@ TEST(MLOpTest, SVMClassifierDifferentSizeKernelParameters) { test.Run(OpTester::ExpectResult::kExpectFailure, "kernel_params must be empty or have 3 values"); } +TEST(MLOpTest, SVMClassifierSVCLinearUndersizedVectorPerClass) { + OpTester test("SVMClassifier", 1, onnxruntime::kMLDomain); + + std::vector coefficients = {0.766398549079895f, 0.0871576070785522f, 0.110420741140842f, + -0.963976919651031f}; + std::vector support_vectors = {4.80000019073486f, 3.40000009536743f, 1.89999997615814f, + 5.f, 3.f, 1.60000002384186f, + 4.5f, 2.29999995231628f, 1.29999995231628f, + 5.09999990463257f, 2.5f, 3.f}; + std::vector rho = 
{2.23510527610779f}; + std::vector kernel_params = {0.122462183237076f, 0.f, 3.f}; // gamma, coef0, degree + std::vector classes = {0, 1}; + std::vector vectors_per_class = {3}; // undersized: 2 classes but only 1 entry + + std::vector X = {5.1f, 3.5f, 1.4f, + 4.9f, 3.f, 1.4f, + 4.7f, 3.2f, 1.3f, + 4.6f, 3.1f, 1.5f, + 5.f, 3.6f, 1.4f}; + std::vector scores_predictions = {-1.5556798f, 1.5556798f, + -1.2610321f, 1.2610321f, + -1.5795376f, 1.5795376f, + -1.3083477f, 1.3083477f, + -1.6572928f, 1.6572928f}; + + std::vector class_predictions = {0, 0, 0, 0, 0}; + + test.AddAttribute("kernel_type", std::string("LINEAR")); + test.AddAttribute("coefficients", coefficients); + test.AddAttribute("support_vectors", support_vectors); + test.AddAttribute("vectors_per_class", vectors_per_class); + test.AddAttribute("rho", rho); + test.AddAttribute("kernel_params", kernel_params); + test.AddAttribute("classlabels_ints", classes); + + test.AddInput("X", {5, 3}, X); + test.AddOutput("Y", {5}, class_predictions); + test.AddOutput("Z", {5, 2}, scores_predictions); + + test.Run(OpTester::ExpectResult::kExpectFailure, "Mismatch between classlabels_ints/classlabels_strings and vectors_per_class dimensions."); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index f3e233fd69a64..25d37846a2028 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -888,6 +888,97 @@ TEST(ConvTest, Conv2D_MatMul_SplitK_With_Bias) { TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); } +TEST(ConvTest, Conv2D_MatMul_Batched_No_Bias) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + std::vector{1, 1}, // dilations + 1, // group + std::vector{1, 1}, // kernel_shape + std::vector{0, 0, 0, 0}, // pads + std::vector{1, 1}, // strides + {} // excluded EPs + }; + + constexpr int64_t batch = 2; // batch > 1 + constexpr int64_t M = 16; + constexpr int64_t K = 768; + constexpr int64_t N = 64; + + std::vector X_shape = {batch, K, M, 1}; + std::vector W_shape = {N, K, 1, 1}; + std::vector Y_shape = {batch, N, M, 1}; + + RandomValueGenerator random{5678}; + std::vector X(random.Gaussian(AsSpan(X_shape), 0.0f, 0.025f)); + std::vector W(random.Gaussian(AsSpan(W_shape), 0.0f, 0.025f)); + + std::vector expected_vals(batch * N * M, 0.0f); + for (int64_t b = 0; b < batch; ++b) { + for (int64_t m = 0; m < M; ++m) { + for (int64_t n = 0; n < N; ++n) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) { + int x_index = static_cast(b * K * M + k * M + m); + int w_index = static_cast(n * K + k); + sum += X[x_index] * W[w_index]; + } + int y_index = static_cast(b * N * M + n * M + m); + expected_vals[y_index] = sum; + } + } + } + + TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvTest, Conv2D_MatMul_Batched_With_Bias) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + std::vector{1, 1}, // dilations + 1, // group + std::vector{1, 1}, // kernel_shape + std::vector{0, 0, 0, 0}, // pads + std::vector{1, 1}, // strides + {} // excluded EPs + }; + + constexpr int64_t batch = 2; + constexpr int64_t M = 16; + constexpr int64_t K = 768; + constexpr int64_t N = 64; + + std::vector X_shape = {batch, K, M, 1}; + std::vector W_shape = {N, K, 1, 1}; + std::vector Y_shape = {batch, N, M, 1}; + std::vector B_shape = {N}; + + 
RandomValueGenerator random{5678}; + std::vector X(random.Gaussian(AsSpan(X_shape), 0.0f, 0.025f)); + std::vector W(random.Gaussian(AsSpan(W_shape), 0.0f, 0.025f)); + std::vector B(random.Gaussian(AsSpan(B_shape), 0.0f, 0.25f)); + + std::vector expected_vals(batch * N * M, 0.0f); + for (int64_t b = 0; b < batch; ++b) { + for (int64_t m = 0; m < M; ++m) { + for (int64_t n = 0; n < N; ++n) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) { + int x_index = static_cast(b * K * M + k * M + m); + int w_index = static_cast(n * K + k); + sum += X[x_index] * W[w_index]; + } + sum += B[static_cast(n)]; + int y_index = static_cast(b * N * M + n * M + m); + expected_vals[y_index] = sum; + } + } + } + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + // Conv10 TEST(ConvTest, Conv3D_1) { ConvOpAndTestAttributes attrs = { diff --git a/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc index 6004ae8e18c05..86a58c6e890f0 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc @@ -525,7 +525,7 @@ TEST(ConvTransposeTest, ConvTranspose_InvalidKernelShape) { // so drop the part that differs from the expected string "kernel_shape num_dims is not compatible with W num_dims. kernel_shape: {1,1,1,5} W: {1,1,", {kTensorrtExecutionProvider, kQnnExecutionProvider, - kDmlExecutionProvider}); // TODO: Unskip when fixed #41968513 + kDmlExecutionProvider, kOpenVINOExecutionProvider}); // TODO: Unskip when fixed #41968513 } TEST(ConvTransposeTest, ConvTranspose_InvalidBiasShape_1) { diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc index ba3bf869b7f0a..f10aa5a49c120 100755 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc @@ -23,6 +23,10 @@ std::vector> GetExecutionProviders() { execution_providers.push_back(DefaultCoreMLExecutionProvider(/*use_mlprogram*/ true)); #endif +#ifdef USE_WEBGPU + execution_providers.push_back(DefaultWebGpuExecutionProvider()); +#endif + return execution_providers; } diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc index 393adbede82fb..79b4156f5d6c0 100644 --- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc @@ -1454,6 +1454,9 @@ void QuantizeLinearOp19Test(bool saturate) { } TEST(QuantizeLinearOpTest, Float8) { +#ifdef USE_OPENVINO + GTEST_SKIP() << "Skipping Float8 QuantizeLinear test for OpenVINO EP"; +#endif constexpr int min_cuda_architecture = 11080; bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); @@ -1496,6 +1499,9 @@ void QuantizeLinearOp19F16Test(bool saturate) { } TEST(QuantizeLinearOpMLFloat16Test, Float8) { +#ifdef USE_OPENVINO + GTEST_SKIP() << "Skipping Float8 QuantizeLinear test for OpenVINO EP"; +#endif constexpr int min_cuda_architecture = 11080; bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); diff 
--git a/onnxruntime/test/python/onnxruntime_test_python_backend.py b/onnxruntime/test/python/onnxruntime_test_python_backend.py index 416d9b6edecd1..bb83f6d36011f 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend.py @@ -2,6 +2,8 @@ # Licensed under the MIT License. # -*- coding: UTF-8 -*- +import os +import tempfile import unittest import numpy as np @@ -64,5 +66,128 @@ def test_allocation_plan_works_with_only_execute_path_to_fetches_option(self): assert_allclose(session_run_results[0], -(inp0 - inp1)) +class TestBackendKwargsAllowlist(unittest.TestCase): + """Tests that the SessionOptions/RunOptions kwargs allowlist correctly blocks + dangerous attributes and allows safe ones, preventing arbitrary file writes + through user-controlled kwargs.""" + + def test_blocked_session_option_optimized_model_filepath_raises(self): + """optimized_model_filepath is a known SessionOptions attr but is not in the allowlist. + It must raise RuntimeError to prevent arbitrary file overwrites.""" + name = get_name("mul_1.onnx") + with tempfile.NamedTemporaryFile(suffix=".bin") as tmp: + with self.assertRaises(RuntimeError) as ctx: + backend.prepare(name, optimized_model_filepath=tmp.name) + self.assertIn("not permitted", str(ctx.exception)) + + def test_blocked_session_option_profile_file_prefix_raises(self): + """profile_file_prefix is a known SessionOptions attr but is not in the allowlist. + It must raise RuntimeError to prevent arbitrary file writes via profiling output.""" + name = get_name("mul_1.onnx") + with tempfile.TemporaryDirectory() as tmpdir: + prefix = os.path.join(tmpdir, "profile") + with self.assertRaises(RuntimeError) as ctx: + backend.prepare(name, profile_file_prefix=prefix) + self.assertIn("not permitted", str(ctx.exception)) + + def test_blocked_session_option_enable_profiling_raises(self): + """enable_profiling is excluded from the allowlist because it causes uncontrolled + file writes (profiling JSON) to the current working directory.""" + name = get_name("mul_1.onnx") + with self.assertRaises(RuntimeError) as ctx: + backend.prepare(name, enable_profiling=True) + self.assertIn("not permitted", str(ctx.exception)) + + def test_unknown_kwarg_is_silently_ignored(self): + """A kwarg that is not a SessionOptions attribute at all must be silently ignored. 
+ This preserves backward compatibility for callers who pass extra kwargs.""" + name = get_name("mul_1.onnx") + rep = backend.prepare(name, totally_unknown_kwarg="foo") + self.assertIsNotNone(rep) + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = rep.run(x) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_safe_session_option_graph_optimization_level_is_accepted(self): + """graph_optimization_level is in the allowlist and must be accepted without error.""" + name = get_name("mul_1.onnx") + rep = backend.prepare(name, graph_optimization_level=onnxrt.GraphOptimizationLevel.ORT_DISABLE_ALL) + self.assertIsNotNone(rep) + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = rep.run(x) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_safe_session_option_intra_op_num_threads_is_accepted(self): + """intra_op_num_threads is in the allowlist and must be accepted without error.""" + name = get_name("mul_1.onnx") + rep = backend.prepare(name, intra_op_num_threads=1) + self.assertIsNotNone(rep) + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = rep.run(x) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_blocked_run_option_terminate_raises(self): + """terminate is a known RunOptions attr excluded from the allowlist; BackendRep.run() must raise RuntimeError when it is passed.""" + name = get_name("mul_1.onnx") + rep = backend.prepare(name) + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + with self.assertRaises(RuntimeError) as ctx: + rep.run(x, terminate=True) + self.assertIn("not permitted", str(ctx.exception)) + + def test_run_model_with_safe_session_option(self): + """run_model() must accept safe SessionOptions kwargs and produce correct output.""" + name = get_name("mul_1.onnx") + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = backend.run(name, [x], graph_optimization_level=onnxrt.GraphOptimizationLevel.ORT_DISABLE_ALL) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_run_model_with_safe_run_option(self): + """run_model() must accept safe RunOptions kwargs and produce correct output.""" + name = get_name("mul_1.onnx") + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = backend.run(name, [x], only_execute_path_to_fetches=True) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_run_model_with_blocked_run_option_raises(self): + """run_model() must raise RuntimeError when given a blocked RunOptions attribute.""" + name = get_name("mul_1.onnx") + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + with self.assertRaises(RuntimeError) as ctx: + backend.run(name, [x], terminate=True) + self.assertIn("not permitted", str(ctx.exception)) + + def test_unknown_kwarg_is_silently_ignored_in_run(self): + """A kwarg unknown to RunOptions must be silently ignored by rep.run().""" + name = 
get_name("mul_1.onnx") + rep = backend.prepare(name) + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = rep.run(x, completely_unknown_key="bar") + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_unknown_kwarg_is_silently_ignored_in_run_model(self): + """An unknown kwarg must be silently ignored by both prepare() and rep.run() in run_model().""" + name = get_name("mul_1.onnx") + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = backend.run(name, [x], completely_unknown_key="baz") + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_run_model_with_blocked_session_option_raises(self): + """run_model() must raise RuntimeError when given a blocked SessionOptions attribute.""" + name = get_name("mul_1.onnx") + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + with tempfile.NamedTemporaryFile(suffix=".bin") as tmp: + with self.assertRaises(RuntimeError) as ctx: + backend.run(name, [x], optimized_model_filepath=tmp.name) + self.assertIn("not permitted", str(ctx.exception)) + + if __name__ == "__main__": unittest.main(module=__name__, buffer=True) diff --git a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py index d6c1dd9cff3f3..987efd5af5e8e 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py @@ -76,6 +76,35 @@ def test_ort_value_update_in_place(self): ortvalue_gpu.update_inplace(x1) np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) + def test_ort_value_update_in_place_from_ortvalue(self): + # Test CPU to CPU copy via OrtValue + x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) + + ortvalue_dst = onnxrt.OrtValue.ortvalue_from_numpy(x0) + ortvalue_src = onnxrt.OrtValue.ortvalue_from_numpy(x1) + ortvalue_dst.update_inplace(ortvalue_src) + np.testing.assert_allclose(ortvalue_dst.numpy(), x1) + + if "CUDAExecutionProvider" in onnxrt.get_available_providers(): + # Test GPU to GPU copy via OrtValue + ortvalue_gpu_dst = onnxrt.OrtValue.ortvalue_from_numpy(x0, "cuda", 0) + ortvalue_gpu_src = onnxrt.OrtValue.ortvalue_from_numpy(x1, "cuda", 0) + ortvalue_gpu_dst.update_inplace(ortvalue_gpu_src) + np.testing.assert_allclose(ortvalue_gpu_dst.numpy(), x1) + + # Test CPU OrtValue to GPU OrtValue copy + ortvalue_gpu_dst2 = onnxrt.OrtValue.ortvalue_from_numpy(x0, "cuda", 0) + ortvalue_cpu_src = onnxrt.OrtValue.ortvalue_from_numpy(x1) + ortvalue_gpu_dst2.update_inplace(ortvalue_cpu_src) + np.testing.assert_allclose(ortvalue_gpu_dst2.numpy(), x1) + + # Test GPU OrtValue to CPU OrtValue copy + ortvalue_cpu_dst = onnxrt.OrtValue.ortvalue_from_numpy(x0) + ortvalue_gpu_src2 = onnxrt.OrtValue.ortvalue_from_numpy(x1, "cuda", 0) + ortvalue_cpu_dst.update_inplace(ortvalue_gpu_src2) + np.testing.assert_allclose(ortvalue_cpu_dst.numpy(), x1) + def test_select_ep_to_run_cuda_graph(self): if "TensorrtExecutionProvider" in onnxrt.get_available_providers(): providers = [("TensorrtExecutionProvider", {"trt_cuda_graph_enable": True})] diff --git a/onnxruntime/test/python/transformers/test_paged_attention_cuda.py 
b/onnxruntime/test/python/transformers/test_paged_attention_cuda.py index 66eb4a885620b..fda861c8125ff 100644 --- a/onnxruntime/test/python/transformers/test_paged_attention_cuda.py +++ b/onnxruntime/test/python/transformers/test_paged_attention_cuda.py @@ -262,6 +262,7 @@ def paged_attention_func( cos=None, sin=None, window_size=-1, + sdpa_kernel=0, ): num_tokens = cumulative_sequence_length[-1].item() num_blocks = key_cache.shape[0] @@ -282,7 +283,11 @@ def paged_attention_func( "block_table": block_table.detach().cpu().numpy(), } sess_options = SessionOptions() - ort_session = InferenceSession(onnx_model_str, sess_options, providers=[config.ep]) + if sdpa_kernel != 0 and config.ep == "CUDAExecutionProvider": + providers = [(config.ep, {"sdpa_kernel": str(sdpa_kernel)})] + else: + providers = [config.ep] + ort_session = InferenceSession(onnx_model_str, sess_options, providers=providers) io_binding = ort_session.io_binding() if key is not None and value is not None: ort_inputs["key"] = key.detach().cpu().numpy() @@ -490,6 +495,7 @@ def parity_check_paged_attention( config: Config, rtol=1e-3, atol=1e-3, + sdpa_kernel=0, ): # Generate padded inputs q = torch.randn( @@ -620,6 +626,7 @@ def parity_check_paged_attention( cos, sin, left_window_size, + sdpa_kernel=sdpa_kernel, ) num_tokens = q_unpad.shape[0] out = torch.reshape(out, (num_tokens, config.num_heads, config.head_size)) @@ -672,6 +679,25 @@ def has_flash_attention(): ) +def has_memory_efficient_attention(): + # CUTLASS fMHA (MemoryEfficientAttention) gate — these tests are fp16-only, + # so sm>=53 is sufficient. bf16 MEA would require sm>=80 but is not covered here. + if not torch.cuda.is_available(): + return False + if "CUDAExecutionProvider" not in get_available_providers(): + return False + major, minor = torch.cuda.get_device_capability() + return (major * 10 + minor) >= 53 + + +# Bit value matching AttentionBackend::EFFICIENT_ATTENTION in +# onnxruntime/contrib_ops/cpu/bert/attention_common.h. Passing this as the +# CUDA provider option `sdpa_kernel` forces the PagedAttention kernel to +# select the MemoryEfficientAttention (CUTLASS fMHA) fallback even on SM>=80 +# where FlashAttention would otherwise be preferred. +SDPA_KERNEL_EFFICIENT_ATTENTION = 2 + + def paged_attention_test_cases(): batches = [4] if pipeline_mode else [1, 3, 5] seqs = ( @@ -732,5 +758,25 @@ def test_paged_attention(self, _, config): parity_check_paged_attention(config, rtol=5e-3, atol=5e-3) +@unittest.skipIf( + not has_memory_efficient_attention(), + reason="MemoryEfficientAttention (fp16) requires sm>=53; skipping.", +) +class TestPagedAttentionMEA(unittest.TestCase): + """Runs the same parity matrix as TestPagedAttention but forces the CUTLASS + memory-efficient attention fallback via the `sdpa_kernel` CUDA provider option. 
+ This is the only coverage for the SM<80 fallback path introduced for PagedAttention;
+ on SM>=80 the class still runs to exercise the MEA dispatch end-to-end."""
+
+ @parameterized.expand(paged_attention_test_cases())
+ def test_paged_attention_mea(self, _, config):
+ parity_check_paged_attention(
+ config,
+ rtol=5e-3,
+ atol=5e-3,
+ sdpa_kernel=SDPA_KERNEL_EFFICIENT_ATTENTION,
+ )
+
+
 if __name__ == "__main__":
 unittest.main(verbosity=2)
diff --git a/onnxruntime/test/shared_lib/test_nontensor_types.cc b/onnxruntime/test/shared_lib/test_nontensor_types.cc
index ba16bd6c9888f..497298474b36a 100644
--- a/onnxruntime/test/shared_lib/test_nontensor_types.cc
+++ b/onnxruntime/test/shared_lib/test_nontensor_types.cc
@@ -1256,4 +1256,100 @@ TEST(CApiTest, SparseTensorFillSparseFormatStringsAPI) {
 }
 }
 }
+
+#if !defined(ORT_NO_EXCEPTIONS)
+TEST(CApiTest, SparseTensorInvalidIndicesValidation) {
+ auto allocator = Ort::AllocatorWithDefaultOptions();
+ Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
+
+ // Common dense shape and values
+ const std::vector<int64_t> dense_shape{3, 3};
+ Ort::Value::Shape ort_dense_shape{dense_shape.data(), dense_shape.size()};
+ std::vector<float> values = {1, 1, 1};
+ constexpr int64_t values_len = 3;
+
+ //
+ // COO Negative linear index
+ //
+ {
+ auto coo_st = Ort::Value::CreateSparseTensor<float>(allocator, ort_dense_shape);
+ std::vector<int64_t> linear_indices = {-1, 3, 5};
+ ASSERT_THROW(
+ coo_st.FillSparseTensorCoo(info, {&values_len, 1U, {values.data()}},
+ linear_indices.data(), linear_indices.size()),
+ Ort::Exception);
+ }
+
+ //
+ // COO Linear index out of upper bounds
+ //
+ {
+ auto coo_st = Ort::Value::CreateSparseTensor<float>(allocator, ort_dense_shape);
+ std::vector<int64_t> linear_indices = {0, 3, 9}; // 9 is out of bounds for 3x3=9 (0-8)
+ ASSERT_THROW(
+ coo_st.FillSparseTensorCoo(info, {&values_len, 1U, {values.data()}},
+ linear_indices.data(), linear_indices.size()),
+ Ort::Exception);
+ }
+
+ //
+ // COO 2D indices out of row bounds
+ //
+ {
+ auto coo_st = Ort::Value::CreateSparseTensor<float>(allocator, ort_dense_shape);
+ std::vector<int64_t> dim_indices = {
+ 0, 1, // Valid
+ 3, 0, // Invalid row 3
+ 2, 2 // Valid
+ };
+ ASSERT_THROW(
+ coo_st.FillSparseTensorCoo(info, {&values_len, 1U, {values.data()}},
+ dim_indices.data(), dim_indices.size()),
+ Ort::Exception);
+ }
+
+ //
+ // CSR inner index out of column bounds
+ //
+ {
+ auto csr_st = Ort::Value::CreateSparseTensor<float>(allocator, ort_dense_shape);
+ std::vector<int64_t> inner_indices = {1, 3, 1}; // 3 is out of bounds for 3 cols (0-2)
+ std::vector<int64_t> outer_indices = {0, 1, 2, 3};
+ ASSERT_THROW(
+ csr_st.FillSparseTensorCsr(info, {&values_len, 1U, {values.data()}},
+ inner_indices.data(), inner_indices.size(),
+ outer_indices.data(), outer_indices.size()),
+ Ort::Exception);
+ }
+
+ //
+ // CSR outer index not monotonically non-decreasing
+ //
+ {
+ auto csr_st = Ort::Value::CreateSparseTensor<float>(allocator, ort_dense_shape);
+ std::vector<int64_t> inner_indices = {0, 1, 2};
+ std::vector<int64_t> outer_indices = {0, 2, 1, 3}; // Drops from 2 to 1
+ ASSERT_THROW(
+ csr_st.FillSparseTensorCsr(info, {&values_len, 1U, {values.data()}},
+ inner_indices.data(), inner_indices.size(),
+ outer_indices.data(), outer_indices.size()),
+ Ort::Exception);
+ }
+
+ //
+ // CSR outer index out of upper bounds (greater than inner_indices.size())
+ //
+ {
+ auto csr_st = Ort::Value::CreateSparseTensor<float>(allocator, ort_dense_shape);
+ std::vector<int64_t> inner_indices = {0, 1, 2};
+ std::vector<int64_t> outer_indices = {0, 1, 2, 4}; // 4 is > inner_indices.size() (3)
+ ASSERT_THROW(
+ 
csr_st.FillSparseTensorCsr(info, {&values_len, 1U, {values.data()}}, + inner_indices.data(), inner_indices.size(), + outer_indices.data(), outer_indices.size()), + Ort::Exception); + } +} +#endif // !defined(ORT_NO_EXCEPTIONS) + #endif // !defined(DISABLE_SPARSE_TENSORS) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 4b231011832e0..f42617ba1b04c 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -888,6 +888,9 @@ def generate_build_tree( if args.enable_arm_neon_nchwc: cmake_args += ["-Donnxruntime_USE_ARM_NEON_NCHWC=ON"] + if args.enable_rvv: + cmake_args += ["-Donnxruntime_USE_RVV=ON"] + if not args.no_sve: cmake_args += ["-Donnxruntime_USE_SVE=ON"] diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py index e30c5f8979183..b40bf4c2b25c6 100644 --- a/tools/ci_build/build_args.py +++ b/tools/ci_build/build_args.py @@ -673,6 +673,11 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None: cpu_group.add_argument( "--enable_arm_neon_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels." ) + cpu_group.add_argument( + "--enable_rvv", + action="store_true", + help="Enable riscv64 MLAS kernels that use the RISC-V Vector extension.", + ) # --- DNNL (formerly MKL-DNN / oneDNN) --- dnnl_group = parser.add_argument_group("DNNL Execution Provider") diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index 3b28f80f8ec1c..5bcdcc2e1ecee 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -52,3 +52,4 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Tanh|| |ai.onnx:Transpose|| |ai.onnx:Unsqueeze|| +|com.microsoft:QuickGelu|Produced by ORT's `QuickGeluFusion` optimizer pass. 
Decomposed into `mul` / `sigmoid` / `mul`.| diff --git a/tools/ci_build/github/azure-pipelines/py-cuda13-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda13-packaging-pipeline.yml index 1d432b662034b..f816c915031a9 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda13-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda13-packaging-pipeline.yml @@ -66,3 +66,17 @@ extends: cudnn_folder: '9.14.0.64_cuda13' cmake_cuda_archs: '75-real;80-real;86-real;89-real;90-real;100-real;120-real;120-virtual' docker_base_image: 'onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1' + docker_base_image_aarch64: 'onnxruntimebuildcache.azurecr.io/public/azureml/onnxruntime_build_cuda13_aarch64_almalinux9_gcc14:20260323.1' + AArch64LinuxPythonConfigurations: + - python_version: '3.11' + docker_python_exe_path: '/opt/python/cp311-cp311/bin/python3.11' + - python_version: '3.12' + docker_python_exe_path: '/opt/python/cp312-cp312/bin/python3.12' + - python_version: '3.13' + docker_python_exe_path: '/opt/python/cp313-cp313/bin/python3.13' + - python_version: '3.13t' + docker_python_exe_path: '/opt/python/cp313-cp313t/bin/python3.13' + - python_version: '3.14' + docker_python_exe_path: '/opt/python/cp314-cp314/bin/python3.14' + - python_version: '3.14t' + docker_python_exe_path: '/opt/python/cp314-cp314t/bin/python3.14' diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index f22b59218db1e..c6ad801fe4aa4 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -60,7 +60,17 @@ parameters: - name: docker_base_image type: string - displayName: 'Linux docker base image' + displayName: 'Linux x86_64 docker base image' + +- name: docker_base_image_aarch64 + type: string + displayName: 'Linux aarch64 docker base image' + default: '' + +- name: AArch64LinuxPythonConfigurations + type: object + displayName: 'aarch64 Linux Python build configurations' + default: [] stages: # Use separated cudnn folder for CUDA 13.0 on Windows. 
@@ -102,11 +112,30 @@ stages: ${{ if eq(config.python_version, '3.12') }}: build_intermediates_artifact_name: linux_gpu_wheel_x86_64 + # Linux aarch64: one parallel stage per Python version + - ${{ each config in parameters.AArch64LinuxPythonConfigurations }}: + - template: py-linux-gpu-stage.yml + parameters: + stage_name: Linux_py_GPU_Wheels_aarch64_${{ replace(config.python_version, '.', '_') }} + arch: 'aarch64' + machine_pool: 'onnxruntime-linux-ARM64-CPU-2019' + extra_build_arg: ${{ parameters.build_py_parameters }} + cmake_build_type: ${{ parameters.cmake_build_type }} + cuda_version: ${{ parameters.cuda_version }} + docker_base_image: ${{ parameters.docker_base_image_aarch64 }} + python_version: ${{ config.python_version }} + docker_python_exe_path: ${{ config.docker_python_exe_path }} + wheel_artifact_name: onnxruntime_gpu_aarch64_${{ replace(config.python_version, '.', '_') }} + ${{ if eq(config.python_version, '3.12') }}: + build_intermediates_artifact_name: linux_gpu_wheel_aarch64 + # Merge per-version Linux wheel artifacts into a single combined artifact for downstream consumers - stage: Linux_py_GPU_Wheels_Merge_Artifacts dependsOn: - ${{ each config in parameters.LinuxPythonConfigurations }}: - Linux_py_GPU_Wheels_x86_64_${{ replace(config.python_version, '.', '_') }} + - ${{ each config in parameters.AArch64LinuxPythonConfigurations }}: + - Linux_py_GPU_Wheels_aarch64_${{ replace(config.python_version, '.', '_') }} jobs: - job: Linux_py_GPU_Wheels_Merge_Artifacts workspace: @@ -130,3 +159,10 @@ stages: inputs: artifact: onnxruntime_gpu_${{ replace(config.python_version, '.', '_') }} targetPath: $(Build.ArtifactStagingDirectory)/onnxruntime_gpu + + - ${{ each config in parameters.AArch64LinuxPythonConfigurations }}: + - task: DownloadPipelineArtifact@2 + displayName: 'Download wheel - aarch64 Python ${{ config.python_version }}' + inputs: + artifact: onnxruntime_gpu_aarch64_${{ replace(config.python_version, '.', '_') }} + targetPath: $(Build.ArtifactStagingDirectory)/onnxruntime_gpu diff --git a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml index d8793c147477d..47ccd4cd2fe73 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml @@ -63,6 +63,8 @@ stages: pool: name: ${{ parameters.machine_pool }} os: linux + ${{ if eq(parameters.arch, 'aarch64') }}: + hostArchitecture: Arm64 templateContext: outputs: - output: pipelineArtifact @@ -80,10 +82,17 @@ stages: value: '' - template: ../templates/common-variables.yml - name: trt_version - ${{ if eq(parameters.cuda_version, '13.0') }}: + ${{ if eq(parameters.arch, 'aarch64') }}: + value: ${{ variables.aarch64_trt_version }} + ${{ if and(ne(parameters.arch, 'aarch64'), eq(parameters.cuda_version, '13.0')) }}: value: ${{ variables.linux_trt_version_cuda13 }} - ${{ if eq(parameters.cuda_version, '12.8') }}: + ${{ if and(ne(parameters.arch, 'aarch64'), eq(parameters.cuda_version, '12.8')) }}: value: ${{ variables.linux_trt_version_cuda12 }} + - name: trt_download_url + ${{ if and(eq(parameters.arch, 'aarch64'), eq(parameters.cuda_version, '13.0')) }}: + value: ${{ variables.aarch64_trt_download_url_cuda13 }} + ${{ else }}: + value: '' steps: - checkout: self clean: true @@ -99,7 +108,7 @@ stages: parameters: Dockerfile: tools/ci_build/github/linux/docker/inference/${{ parameters.arch }}/python/cuda/Dockerfile Context: 
tools/ci_build/github/linux/docker/inference/${{ parameters.arch }}/python/cuda - DockerBuildArgs: "--build-arg BASEIMAGE=${{ parameters.docker_base_image }} --build-arg TRT_VERSION=${{ variables.trt_version }} --build-arg BUILD_UID=$( id -u )" + DockerBuildArgs: "--build-arg BASEIMAGE=${{ parameters.docker_base_image }} --build-arg TRT_VERSION=${{ variables.trt_version }} --build-arg TRT_DOWNLOAD_URL=${{ variables.trt_download_url }} --build-arg BUILD_UID=$( id -u )" Repository: onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} diff --git a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml index 1191e9e98eef1..8c8dae9820810 100644 --- a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml +++ b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml @@ -1,8 +1,11 @@ variables: cuda12_trt_version: '10.14.1.48' cuda13_trt_version: '10.14.1.48' + aarch64_trt_version: '10.15.1.29' # As for Debian installation, replace '-1.' by '-1+' when assigning trt version below linux_trt_version_cuda13: ${{ variables.cuda13_trt_version }}-1.cuda13.0 linux_trt_version_cuda12: ${{ variables.cuda12_trt_version }}-1.cuda12.9 + # aarch64 TRT tar download (no RPMs available for aarch64) + aarch64_trt_download_url_cuda13: https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.15.1/tars/TensorRT-${{ variables.aarch64_trt_version }}.Linux.aarch64-gnu.cuda-13.1.tar.gz win_trt_folder_cuda13: TensorRT-${{ variables.cuda13_trt_version }}.Windows.win10.cuda-13.0 win_trt_folder_cuda12: TensorRT-${{ variables.cuda12_trt_version }}.Windows.win10.cuda-12.9 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml index 1dde96e21a636..afcba73456558 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml @@ -45,10 +45,17 @@ jobs: - name: skipComponentGovernanceDetection value: true - name: trt_version - ${{ if eq(parameters.cuda_version, '13.0') }}: + ${{ if eq(parameters.arch, 'aarch64') }}: + value: ${{ variables.aarch64_trt_version }} + ${{ if and(ne(parameters.arch, 'aarch64'), eq(parameters.cuda_version, '13.0')) }}: value: ${{ variables.linux_trt_version_cuda13 }} - ${{ if eq(parameters.cuda_version, '12.8') }}: + ${{ if and(ne(parameters.arch, 'aarch64'), eq(parameters.cuda_version, '12.8')) }}: value: ${{ variables.linux_trt_version_cuda12 }} + - name: trt_download_url + ${{ if and(eq(parameters.arch, 'aarch64'), eq(parameters.cuda_version, '13.0')) }}: + value: ${{ variables.aarch64_trt_download_url_cuda13 }} + ${{ else }}: + value: '' workspace: clean: all pool: ${{ parameters.machine_pool }} @@ -77,7 +84,7 @@ jobs: parameters: Dockerfile: tools/ci_build/github/linux/docker/inference/${{ parameters.arch }}/python/cuda/Dockerfile Context: tools/ci_build/github/linux/docker/inference/${{ parameters.arch }}/python/cuda - DockerBuildArgs: "--build-arg BASEIMAGE=${{ parameters.docker_base_image }} --build-arg TRT_VERSION=${{ variables.trt_version }} --build-arg BUILD_UID=$( id -u )" + DockerBuildArgs: "--build-arg BASEIMAGE=${{ parameters.docker_base_image }} --build-arg TRT_VERSION=${{ variables.trt_version }} --build-arg TRT_DOWNLOAD_URL=${{ variables.trt_download_url }} --build-arg BUILD_UID=$( 
id -u )" Repository: onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} - task: Bash@3 diff --git a/tools/ci_build/github/azure-pipelines/templates/set-plugin-build-variables-step.yml b/tools/ci_build/github/azure-pipelines/templates/set-plugin-build-variables-step.yml index 212eca44ae3ec..e92eb0dafadcb 100644 --- a/tools/ci_build/github/azure-pipelines/templates/set-plugin-build-variables-step.yml +++ b/tools/ci_build/github/azure-pipelines/templates/set-plugin-build-variables-step.yml @@ -59,7 +59,10 @@ steps: print("##vso[task.logissue type=error]Failed to get git info: {}".format(e)) sys.exit(1) version_string = "{}-dev.{}+{}".format(original_ver, date_str, commit_sha) - universal_version = "{}-dev.{}.{}".format(original_ver, date_str, commit_sha) + # Prefix the SHA with "commit-" so the pre-release identifier always contains a + # non-digit. Otherwise, an all-numeric short SHA with a leading zero (e.g. "01234567") + # would violate SemVer 2.0.0's rule against leading zeros in numeric identifiers. + universal_version = "{}-dev.{}.commit-{}".format(original_ver, date_str, commit_sha) else: print("##vso[task.logissue type=error]Unknown package_version '{}'. Must be 'release', 'RC', or 'dev'.".format(package_version)) @@ -74,10 +77,10 @@ steps: print("##vso[task.logissue type=error]Version string '{}' is not valid semver 2.0.0.".format(version_string)) sys.exit(1) - # Validate universal version (SemVer 1.0.0 - no build metadata) + # Validate universal version (SemVer 2.0.0, without build metadata) universal_semver_pattern = r"^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?$" if not re.match(universal_semver_pattern, universal_version): - print("##vso[task.logissue type=error]Universal version string '{}' is not valid semver 1.0.0.".format(universal_version)) + print("##vso[task.logissue type=error]Universal version string '{}' is not valid semver 2.0.0 (without build metadata).".format(universal_version)) sys.exit(1) print("##vso[task.setvariable variable=PluginPackageVersion]{}".format(version_string)) diff --git a/tools/ci_build/github/js/validate-npm-packages.py b/tools/ci_build/github/js/validate-npm-packages.py index 73f76ea8bbb5b..2917331b84c07 100644 --- a/tools/ci_build/github/js/validate-npm-packages.py +++ b/tools/ci_build/github/js/validate-npm-packages.py @@ -113,8 +113,13 @@ print(f"##vso[task.setvariable variable=ORT_COMMON_FROM]{ort_common_from}") if tag == "latest" or tag == "" or tag == "rc": - if not RELEASE_NODE or not RELEASE_WEB or not RELEASE_REACT_NATIVE: - raise Exception("@latest or @rc build must release all packages (node, web, react-native)") + # FUTURE WORK: We will either punt `react-native` out of the core package set, or fix it and re-incorporate it. + # Which one is TBD, but for now we are not requiring `react-native` for @latest or @rc builds. + if not RELEASE_NODE or not RELEASE_WEB: + raise Exception("@latest or @rc build must release the following packages: node, web") + if not RELEASE_REACT_NATIVE: + print("WARNING - @latest or @rc build should release `react-native` package. 
This is temporarily not required.") + if count_ort_node_common_tgz != 1: raise Exception("expect one package file for onnxruntime-common for release build") diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index 27bf6f9b9e1d1..7ba5406e00ec0 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -80,8 +80,20 @@ if [ "$BUILD_DEVICE" == "GPU" ]; then fi SHORT_CUDA_VERSION=$(echo "$CUDA_VERSION" | sed 's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/') - #Enable CUDA and TRT EPs. - BUILD_ARGS+=("--use_cuda" "--use_tensorrt" "--cuda_version=$SHORT_CUDA_VERSION" "--tensorrt_home=/usr" "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}" "onnxruntime_USE_FPA_INTB_GEMM=OFF") + CUDA_HOME=/usr/local/cuda-$SHORT_CUDA_VERSION + if [ ! -d "$CUDA_HOME" ] && [ -d /usr/local/cuda ]; then + # Allow the cu13 packaging flow to run on images that expose a newer CUDA minor version via /usr/local/cuda. + CUDA_HOME=/usr/local/cuda + fi + #Enable CUDA EP. + BUILD_ARGS+=("--use_cuda" "--cuda_version=$SHORT_CUDA_VERSION" "--cuda_home=$CUDA_HOME" "--cudnn_home=$CUDA_HOME" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}" "onnxruntime_USE_FPA_INTB_GEMM=OFF") + # Enable TRT EP only if TensorRT is installed. + if [ -f /usr/include/NvInfer.h ]; then + BUILD_ARGS+=("--use_tensorrt" "--tensorrt_home=/usr") + elif [ "$ARCH" != "aarch64" ] && [ -f /opt/tensorrt/include/NvInfer.h ]; then + # The aarch64 TensorRT tarball is not compatible with the packaging image's glibc baseline. + BUILD_ARGS+=("--use_tensorrt" "--tensorrt_home=/opt/tensorrt") + fi fi if [ "$BUILD_DEVICE" == "WEBGPU" ]; then BUILD_ARGS+=("--use_webgpu") diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cuda/Dockerfile new file mode 100644 index 0000000000000..b960961a20336 --- /dev/null +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cuda/Dockerfile @@ -0,0 +1,48 @@ +# The default ARGs are for cuda 12.8 with cudnn9, TensorRT is optional +# Please overwrite BASEIMAGE, TRT_VERSION and other arguments with +# --docker-build-args ' --build-arg BASEIMAGE=other_base_image --build-arg TRT_VERSION=other_trt_version etc...' +# for other cuda version and TRT version +ARG BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubi8 + +FROM $BASEIMAGE +ARG TRT_VERSION +# For aarch64 tar-based TensorRT install +ARG TRT_DOWNLOAD_URL="" +ARG TENSORRT_ROOT=/opt/tensorrt + +# Install TensorRT: use tar download for aarch64 since RPMs are not available +RUN set -eux; \ + if [ -z "${TRT_VERSION}" ]; then \ + echo "TRT_VERSION is empty; skipping TensorRT installation"; \ + elif [ -n "${TRT_DOWNLOAD_URL}" ]; then \ + echo "Installing TensorRT ${TRT_VERSION} from tar"; \ + mkdir -p /tmp/trt "${TENSORRT_ROOT}"; \ + curl -fsSL "${TRT_DOWNLOAD_URL}" -o /tmp/trt/tensorrt.tar.gz; \ + tar -xzf /tmp/trt/tensorrt.tar.gz -C /tmp/trt; \ + extracted_dir="$(find /tmp/trt -mindepth 1 -maxdepth 1 -type d | head -n 1)"; \ + cp -a "${extracted_dir}/." "${TENSORRT_ROOT}/"; \ + rm -rf /tmp/trt; \ + if [ -d "${TENSORRT_ROOT}/targets/sbsa-linux-gnu/lib" ] && [ ! 
-e "${TENSORRT_ROOT}/lib" ]; then \ + ln -s "${TENSORRT_ROOT}/targets/sbsa-linux-gnu/lib" "${TENSORRT_ROOT}/lib"; \ + fi; \ + if [ -d "${TENSORRT_ROOT}/targets/sbsa-linux-gnu/include" ] && [ ! -e "${TENSORRT_ROOT}/include" ]; then \ + ln -s "${TENSORRT_ROOT}/targets/sbsa-linux-gnu/include" "${TENSORRT_ROOT}/include"; \ + fi; \ + else \ + echo "TRT_VERSION is ${TRT_VERSION} but no TRT_DOWNLOAD_URL provided; skipping"; \ + fi + +ENV TENSORRT_ROOT=${TENSORRT_ROOT} +ENV PATH=${TENSORRT_ROOT}/bin:/usr/local/cuda/bin:${PATH} +ENV LD_LIBRARY_PATH=${TENSORRT_ROOT}/lib:${LD_LIBRARY_PATH} +ENV CPATH=${TENSORRT_ROOT}/include:${CPATH} +ENV CUDA_MODULE_LOADING="LAZY" + +ADD scripts /tmp/scripts +RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && rm -rf /tmp/scripts + +ARG BUILD_UID=1001 +ARG BUILD_USER=onnxruntimedev +RUN adduser --uid $BUILD_UID $BUILD_USER +WORKDIR /home/$BUILD_USER +USER $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cuda/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/aarch64/python/cuda/scripts/install_centos.sh new file mode 100755 index 0000000000000..d90683c468627 --- /dev/null +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cuda/scripts/install_centos.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1) + +echo "installing for os major version : $os_major_version" +if [ "$os_major_version" -ge 9 ]; then + dnf install -y glibc-langpack-\* which expat-devel tar unzip zlib-devel make bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget +else + dnf install -y glibc-langpack-\* which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget +fi diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile index 7a29fd7fc728c..03f351d942e70 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile @@ -22,8 +22,8 @@ RUN dnf install -y --nodocs \ && dnf clean all \ && rm -rf /var/cache/dnf -ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2025.4.1 -ARG OPENVINO_PACKAGE_URL=https://storage.openvinotoolkit.org/repositories/openvino/packages/2025.4.1/linux/openvino_toolkit_rhel8_2025.4.1.20426.82bbf0292c5_x86_64.tgz +ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2026.1.0 +ARG OPENVINO_PACKAGE_URL=https://storage.openvinotoolkit.org/repositories/openvino/packages/2026.1/linux/openvino_toolkit_rhel8_2026.1.0.21367.63e31528c62_x86_64.tgz ARG TEMP_DIR=/tmp/openvino_installer RUN mkdir -p ${TEMP_DIR} && \ diff --git a/tools/ci_build/github/linux/run_python_tests.sh b/tools/ci_build/github/linux/run_python_tests.sh index 246bc076fd5b3..e1856e51e3c9c 100755 --- a/tools/ci_build/github/linux/run_python_tests.sh +++ b/tools/ci_build/github/linux/run_python_tests.sh @@ -9,7 +9,7 @@ BUILD_CONFIG="Release" while getopts "d:c:" parameter_Option do case "${parameter_Option}" in -#GPU or CPU. +#GPU or CPU. d) BUILD_DEVICE=${OPTARG};; c) BUILD_CONFIG=${OPTARG};; esac @@ -38,8 +38,20 @@ if [ $ARCH == "x86_64" ]; then fi if [ $BUILD_DEVICE == "GPU" ]; then SHORT_CUDA_VERSION=$(echo $CUDA_VERSION | sed 's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/') + CUDA_HOME=/usr/local/cuda-$SHORT_CUDA_VERSION + if [ ! 
-d "$CUDA_HOME" ] && [ -d /usr/local/cuda ]; then + # Allow the cu13 packaging flow to run on images that expose a newer CUDA minor version via /usr/local/cuda. + CUDA_HOME=/usr/local/cuda + fi - BUILD_ARGS="$BUILD_ARGS --use_cuda --use_tensorrt --cuda_version=$SHORT_CUDA_VERSION --tensorrt_home=/usr --cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION --cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" + BUILD_ARGS="$BUILD_ARGS --use_cuda --cuda_version=$SHORT_CUDA_VERSION --cuda_home=$CUDA_HOME --cudnn_home=$CUDA_HOME" + # Enable TRT EP only if TensorRT is installed. + if [ -f /usr/include/NvInfer.h ]; then + BUILD_ARGS="$BUILD_ARGS --use_tensorrt --tensorrt_home=/usr" + elif [ "$ARCH" != "aarch64" ] && [ -f /opt/tensorrt/include/NvInfer.h ]; then + # The aarch64 TensorRT tarball is not compatible with the packaging image's glibc baseline. + BUILD_ARGS="$BUILD_ARGS --use_tensorrt --tensorrt_home=/opt/tensorrt" + fi fi python3 -m pip install --upgrade pip @@ -47,7 +59,7 @@ python3 -m pip install --upgrade pip python3 -m pip install -r /build/$BUILD_CONFIG/requirements.txt # Install the packages that are needed for running test scripts python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/python/requirements.txt -# The "--no-index" flag is crucial. The local whl folder is just an additional source. Pypi's doc says "there is no +# The "--no-index" flag is crucial. The local whl folder is just an additional source. Pypi's doc says "there is no # ordering in the locations that are searched" if we don't disable the default one with "--no-index" python3 -m pip install --no-index --find-links /build/whl $PYTHON_PACKAGE_NAME cd /build/$BUILD_CONFIG