diff --git a/CMakeLists.txt b/CMakeLists.txt index c9bd226..b978649 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,9 +4,16 @@ project(Sofie DESCRIPTION "SOFIE" LANGUAGES CXX) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +include(GNUInstallDirs) +include(CMakePackageConfigHelpers) + find_package(BLAS) if(NOT BLAS_FOUND) - message(WARNING "BLAS not found: TMVA-SOFIE will not be fully tested") + message(WARNING "BLAS not found: sofie will not be fully tested") endif() message(STATUS "Looking for Protobuf") @@ -17,49 +24,97 @@ if(NOT Protobuf_FOUND) endif() if(NOT Protobuf_FOUND) if(fail-on-missing) - message(FATAL_ERROR "Protobuf libraries not found and they are required (tmva-sofie option enabled)") + message(FATAL_ERROR "Protobuf libraries not found and they are required (sofie option enabled)") else() - message(STATUS "Protobuf not found. Switching off tmva-sofie option") + message(STATUS "Protobuf not found. Switching off sofie option") message(FATAL_ERROR "SOFIE cannot be installed without Protobuf") endif() else() if(Protobuf_VERSION LESS 3.0) if(fail-on-missing) - message(FATAL_ERROR "Protobuf libraries found but is less than the version required (3.0) (tmva-sofie option enabled)") + message(FATAL_ERROR "Protobuf libraries found but is less than the version required (3.0) (sofie option enabled)") else() - message(STATUS "Protobuf found but its version is not high enough (>3.0). Switching off tmva-sofie option") + message(STATUS "Protobuf found but its version is not high enough (>3.0). Switching off sofie option") message(FATAL_ERROR "SOFIE cannot be installed without Protobuf") endif() else() if(NOT TARGET protobuf::protoc) if(fail-on-missing) - message(FATAL_ERROR "Protobuf compiler not found (tmva-sofie option enabled)") + message(FATAL_ERROR "Protobuf compiler not found (sofie option enabled)") else() - message(STATUS "Protobuf compiler not found. 
Switching off tmva-sofie option") + message(STATUS "Protobuf compiler not found. Switching off sofie option") message(FATAL_ERROR "SOFIE cannot be installed without Protobuf") endif() endif() endif() endif() -find_package(ROOT REQUIRED COMPONENTS Core TMVA Tree) -include(${ROOT_USE_FILE}) +option(SOFIE_WITH_ROOT "Enable ROOT support (required for .root weight files and ROOT serialization)" OFF) + +if(SOFIE_WITH_ROOT) + find_package(ROOT REQUIRED COMPONENTS Core TMVA Tree) + if(ROOT_FOUND) + include(${ROOT_USE_FILE}) + message(STATUS "ROOT found: enabling ROOT support in SOFIE") + else() + message(FATAL_ERROR "SOFIE_WITH_ROOT is ON but ROOT was not found") + endif() +else() + message(STATUS "Building SOFIE without ROOT support (SOFIE_WITH_ROOT=OFF)") +endif() set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -set(CMAKE_INSTALL_BINDIR "bin" CACHE PATH "user executables (bin)") -set(CMAKE_INSTALL_INCLUDEDIR "include" CACHE PATH "header files") -set(CMAKE_INSTALL_LIBDIR "lib" CACHE PATH "libraries") if(ccache) set(CMAKE_C_COMPILER_LAUNCHER ccache) set(CMAKE_CXX_COMPILER_LAUNCHER ccache) endif() +option(testing "Build and run tests" OFF) if(testing) - find_package(GTest REQUIRED) + find_package(GTest REQUIRED) enable_testing() endif() -include(cmake/modules/RoottestMacros.cmake) +option(SOFIE_BENCHMARK "Build the SOFIE CUDA benchmark toolkit" OFF) + +if(SOFIE_WITH_ROOT AND ROOT_FOUND) + include(cmake/modules/RoottestMacros.cmake) +else() + include(cmake/modules/SofieTestMacros.cmake) +endif() + +add_subdirectory(utils) +add_subdirectory(core) +add_subdirectory(parsers) + +if(SOFIE_BENCHMARK) + add_subdirectory(benchmark) +endif() + +# ── Install cmake package config files ────────────────────────────────────── + +configure_package_config_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/SOFIEConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/SOFIEConfig.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/SOFIE +) + +write_basic_package_version_file( + 
${CMAKE_CURRENT_BINARY_DIR}/SOFIEConfigVersion.cmake + VERSION ${PROJECT_VERSION} + COMPATIBILITY AnyNewerVersion +) + +install( + EXPORT SOFIETargets + FILE SOFIETargets.cmake + NAMESPACE SOFIE:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/SOFIE +) -add_subdirectory(src) +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/SOFIEConfig.cmake + ${CMAKE_CURRENT_BINARY_DIR}/SOFIEConfigVersion.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/SOFIE +) diff --git a/README.md b/README.md index 97902f8..23ab074 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,13 @@ Since SOFIE is a part of ROOT and therefore needs to be built altogether, it tak ## Installation -1. Getting a ROOT binary. -Download a pre-built binary of ROOT based on your architecture from [here](https://root.cern/install/). +1. SOFIE requires ROOT by default. To explore SOFIE APIs within ROOT, follow the steps below: + 1. Getting a ROOT binary. Download a pre-built binary of ROOT based on your architecture from [here](https://root.cern/install/). + 2. Source it so that SOFIE is able to include its code. For example: + ``` + source root_v6.36.02.Linux-ubuntu24.04-x86_64-gcc13.3/root/bin/thisroot.sh + ``` + To build without ROOT, pass the `-DSOFIE_WITH_ROOT=OFF` CMake flag in the build command. 2. Build standalone SOFIE ```bash @@ -23,9 +28,13 @@ The commands above should build the SOFIE standalone. To include it within the R source setup.sh ``` -Now ROOT should also access the SOFIE libraries while it runs. This helps to accelerate development. Submit your developments here and we will proceed with the developments in ROOT carefull. - +Now ROOT should also access the SOFIE libraries while it runs. This helps to accelerate development. Submit your developments here and we will proceed with the developments in ROOT carefully. This step is not required if SOFIE is built for usage without ROOT (`-DSOFIE_WITH_ROOT=OFF`). +3. 
To enable testing generated code with alpaka implementations, build using the following command: +```bash +cmake -Dtesting=ON -DENABLE_ALPAKA_TESTS=ON -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +``` +The default architecture is CUDA, but it can be changed using an additional `-DALPAKA_BACKEND=hip` CMake option. ## Inspiration The standalone version of SOFIE is developed with inspiration from the standalone version of RooFit developed by Jonas Rembser that can be found [here](https://github.com/guitargeek/roofit). diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 0000000..794aede --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,316 @@ +cmake_minimum_required(VERSION 3.18) +include(FetchContent) + +################################################################################ +# SOFIE Alpaka Benchmark Toolkit +# +# Usage: +# cmake -Bbuild -DSOFIE_BENCHMARK=ON . +# cmake --build build --target sofie_benchmark +# cd build/benchmark && ./sofie_benchmark [options] +# +# To also benchmark with ONNX Runtime GPU: +# cmake -Bbuild -DSOFIE_BENCHMARK=ON -DSOFIE_BENCHMARK_ORT=ON \ +# [-DONNXRUNTIME_ROOT=/usr/local/onnxruntime] . +# ./sofie_benchmark --onnxruntime +# +# Place .onnx models in benchmark/models/ and re-run cmake to register them. 
+################################################################################ + +option(SOFIE_BENCHMARK_ORT + "Also benchmark ONNX Runtime GPU alongside SOFIE (requires ORT ≥ 1.18)" + OFF) + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../core/inc + ${CMAKE_CURRENT_SOURCE_DIR}/../parsers/inc +) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +################################################################################ +# Discover models +################################################################################ + +file(GLOB BENCHMARK_ONNX_MODELS + "${CMAKE_CURRENT_SOURCE_DIR}/models/*.onnx") + +if(NOT BENCHMARK_ONNX_MODELS) + message(STATUS + "SOFIE Benchmark: No .onnx models found in benchmark/models/. " + "Add ONNX models there and re-run cmake to enable benchmarking.") + return() +endif() + +list(LENGTH BENCHMARK_ONNX_MODELS N_MODELS) +message(STATUS "SOFIE Benchmark: Found ${N_MODELS} model(s) in benchmark/models/") + +################################################################################ +# Fetch Alpaka and sofieBLAS (same pinned revisions as the test suite) +################################################################################ + +FetchContent_Declare( + sofieBLAS + GIT_REPOSITORY https://github.com/ML4EP/sofieBLAS + GIT_TAG dev +) +FetchContent_MakeAvailable(sofieBLAS) + +FetchContent_Declare( + alpaka + GIT_REPOSITORY https://github.com/alpaka-group/alpaka + GIT_TAG 2fa91a34ed11b2076e474c5507d920e85cf9b79d +) +FetchContent_MakeAvailable(alpaka) + +################################################################################ +# CUDA (same as the test suite) +################################################################################ + +enable_language(CUDA) +find_package(CUDAToolkit REQUIRED) +message(STATUS "SOFIE Benchmark: CUDA backend (${CUDAToolkit_VERSION})") + +################################################################################ +# Optional: ONNX Runtime GPU backend 
+################################################################################ + +set(SOFIE_ORT_FOUND FALSE) + +if(SOFIE_BENCHMARK_ORT) + # Use manual detection only — the installed ORT CMake config may reference + # a wrong lib path (e.g. lib64 vs lib) and raise a hard error even with QUIET. + # ONNXRUNTIME_ROOT, when provided, is searched first, followed by a few + # common install prefixes; NO_DEFAULT_PATH restricts find_path/find_library + # to exactly those roots. + + set(_ort_search_roots "") + if(DEFINED ONNXRUNTIME_ROOT) + list(APPEND _ort_search_roots "${ONNXRUNTIME_ROOT}") + endif() + list(APPEND _ort_search_roots + /usr/local/onnxruntime /usr/local /usr /opt) + + # Manual header + library search (reliable, no broken cmake-config risk) + find_path(ONNXRUNTIME_INCLUDE_DIR + NAMES onnxruntime_cxx_api.h + PATHS ${_ort_search_roots} + PATH_SUFFIXES include include/onnxruntime + NO_DEFAULT_PATH) + + find_library(ONNXRUNTIME_LIBRARY + NAMES onnxruntime + PATHS ${_ort_search_roots} + PATH_SUFFIXES lib lib64 + NO_DEFAULT_PATH) + + if(ONNXRUNTIME_INCLUDE_DIR AND ONNXRUNTIME_LIBRARY) + set(SOFIE_ORT_FOUND TRUE) + add_library(onnxruntime::onnxruntime SHARED IMPORTED) + set_target_properties(onnxruntime::onnxruntime PROPERTIES + IMPORTED_LOCATION "${ONNXRUNTIME_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${ONNXRUNTIME_INCLUDE_DIR}") + set(SOFIE_ORT_TARGET onnxruntime::onnxruntime) + message(STATUS "SOFIE Benchmark: ONNX Runtime found — ${ONNXRUNTIME_LIBRARY}") + message(STATUS "SOFIE Benchmark: ORT headers — ${ONNXRUNTIME_INCLUDE_DIR}") + else() + message(WARNING + "SOFIE Benchmark: SOFIE_BENCHMARK_ORT=ON but ONNX Runtime " + "not found. Set -DONNXRUNTIME_ROOT= or install ORT. 
" + "ORT benchmarking will be disabled.") + endif() +endif() + +if(SOFIE_BENCHMARK_ORT AND NOT SOFIE_ORT_FOUND) + message(STATUS "SOFIE Benchmark: ORT benchmarking disabled (library not found)") +endif() + +################################################################################ +# Build per-model strings for configure_file +################################################################################ + +set(_EMIT_BLOCK +"try {\n\ + EmitBenchmarkModel(\"@1@\", \"@2@\", outDir);\n\ +} catch (const std::exception &e) {\n\ + std::cerr << \"[ERROR] @2@: \" << e.what() << \"\\n\";\n\ + ++failures;\n\ +} catch (...) {\n\ + std::cerr << \"[ERROR] @2@: unknown exception\\n\";\n\ + ++failures;\n\ +}\n\ +") + +set(_RUN_BLOCK +" Benchmark_@3@(warmup, iterations, weightsDir);\n\ +") + +# ORT call: passes the full ONNX path directly (no SOFIE weights needed) +set(_ORT_RUN_BLOCK +"#ifdef SOFIE_BENCHMARK_ORT\n\ + if (run_ort) BenchmarkORT_GPU(\"@1@\", \"@2@\", warmup, iterations);\n\ +#endif\n\ +") + +set(BENCHMARK_EMIT_CAPTURES "") +set(BENCHMARK_BENCH_HEADERS "") +set(BENCHMARK_RUN_CALLS "") +set(GENERATED_HEADERS "") + +foreach(ONNX_FILE ${BENCHMARK_ONNX_MODELS}) + get_filename_component(MODEL_NAME "${ONNX_FILE}" NAME_WE) + + string(REGEX REPLACE "[^A-Za-z0-9]" "_" MODEL_CPPNAME "${MODEL_NAME}") + + set(GEN_HXX "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_GPU_ALPAKA.hxx") + set(GEN_BENCH "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_bench.hxx") + list(APPEND GENERATED_HEADERS "${GEN_HXX}" "${GEN_BENCH}") + + string(REPLACE "@1@" "${ONNX_FILE}" _emit_cap "${_EMIT_BLOCK}") + string(REPLACE "@2@" "${MODEL_NAME}" _emit_cap "${_emit_cap}") + string(APPEND BENCHMARK_EMIT_CAPTURES "${_emit_cap}") + + # SOFIE Alpaka call + string(REPLACE "@3@" "${MODEL_CPPNAME}" _run_cap "${_RUN_BLOCK}") + string(APPEND BENCHMARK_RUN_CALLS "${_run_cap}") + + # ORT call (guarded by #ifdef at compile time + run_ort flag at runtime) + string(REPLACE "@1@" "${ONNX_FILE}" _ort_cap 
"${_ORT_RUN_BLOCK}") + string(REPLACE "@2@" "${MODEL_NAME}" _ort_cap "${_ort_cap}") + string(APPEND BENCHMARK_RUN_CALLS "${_ort_cap}") + + string(APPEND BENCHMARK_BENCH_HEADERS + "#include \"${MODEL_NAME}_bench.hxx\"\n") +endforeach() + +################################################################################ +# Configure emitter and runner sources +################################################################################ + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/src/BenchmarkEmitter.cxx.in" + "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkEmitter_all.cxx" + @ONLY +) + +set(RUNNER_SRC "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkRunner_all.cu") +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/src/BenchmarkRunner.cxx.in" + "${RUNNER_SRC}" + @ONLY +) + +################################################################################ +# Emitter executable (plain C++, generates SOFIE headers at build time) +################################################################################ + +add_executable(sofie_benchmark_emitter + "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkEmitter_all.cxx" +) + +target_include_directories(sofie_benchmark_emitter PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/../core/inc" + "${CMAKE_CURRENT_SOURCE_DIR}/../parsers/inc" +) + +target_link_libraries(sofie_benchmark_emitter PRIVATE + SOFIE_core + SOFIE_parsers + protobuf::libprotobuf +) + +target_compile_options(sofie_benchmark_emitter PRIVATE + -Wno-unused-parameter + -Wno-array-bounds +) + +################################################################################ +# Custom command: run emitter → generate inference + benchmark headers +################################################################################ + +add_custom_command( + OUTPUT ${GENERATED_HEADERS} + COMMAND "${CMAKE_COMMAND}" -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND "$" "${CMAKE_CURRENT_BINARY_DIR}" + DEPENDS sofie_benchmark_emitter ${BENCHMARK_ONNX_MODELS} + WORKING_DIRECTORY 
"${CMAKE_CURRENT_BINARY_DIR}" + COMMENT "SOFIE Benchmark: generating headers for ${N_MODELS} model(s)..." + VERBATIM +) + +add_custom_target(sofie_benchmark_headers + DEPENDS ${GENERATED_HEADERS} +) + +################################################################################ +# Benchmark runner (compiled as .cu, same as the test suite) +################################################################################ + +set_source_files_properties("${RUNNER_SRC}" PROPERTIES LANGUAGE CUDA) + +add_executable(sofie_benchmark "${RUNNER_SRC}") + +add_dependencies(sofie_benchmark sofie_benchmark_headers) + +target_include_directories(sofie_benchmark PRIVATE + "${CMAKE_CURRENT_BINARY_DIR}" # generated headers live here + "${CMAKE_CURRENT_SOURCE_DIR}/src" # ONNXRuntimeBenchmark.hxx + "${alpaka_SOURCE_DIR}/include" + "${sofieblas_SOURCE_DIR}/include" + "${CUDAToolkit_INCLUDE_DIRS}" +) + +set_target_properties(sofie_benchmark PROPERTIES + CUDA_SEPARABLE_COMPILATION OFF + CUDA_ARCHITECTURES "70;75;80;86;89;90" + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" +) + +target_compile_definitions(sofie_benchmark PRIVATE + ALPAKA_ACC_GPU_CUDA_ENABLED + ALPAKA_HAS_STD_ATOMIC_REF + $<$:SOFIE_BENCHMARK_ORT> +) + +target_compile_options(sofie_benchmark PRIVATE + $<$: + --extended-lambda + --expt-relaxed-constexpr + --use_fast_math + -O2 + -Wno-deprecated-gpu-targets + > + $<$: + -O2 + -fPIC + > +) + +target_link_libraries(sofie_benchmark PRIVATE + SOFIE_core + CUDA::cudart + CUDA::cublas + CUDA::cublasLt + $<$:${SOFIE_ORT_TARGET}> +) + +if(SOFIE_ORT_FOUND) + message(STATUS "SOFIE Benchmark: target 'sofie_benchmark' configured " + "(${N_MODELS} model(s), CUDA backend + ORT-GPU)") +else() + message(STATUS "SOFIE Benchmark: target 'sofie_benchmark' configured " + "(${N_MODELS} model(s), CUDA backend; " + "re-configure with -DSOFIE_BENCHMARK_ORT=ON for ORT comparison)") +endif() + +# Convenience CTest entry +if(testing) + 
add_test( + NAME SofieBenchmark + COMMAND sofie_benchmark --warmup 5 --iterations 20 + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + ) +endif() diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..1a9a0f5 --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,95 @@ +# SOFIE Alpaka Benchmark Toolkit + +Measures **inference latency and throughput** for ONNX models compiled by SOFIE and +executed via Alpaka (CUDA backend). Optionally runs the same models through +**ONNX Runtime GPU** for a side-by-side comparison. + +--- + +## Quick Start + +### 1. Add your models + +``` +benchmark/models/ + GNN_model.onnx + simple_transformer.onnx + resnet50.onnx + ... +``` + +Re-run CMake after adding or removing files (it globs `models/*.onnx`). + +### 2. Configure + +```bash +# SOFIE inference only (default) +cmake -B build -DSOFIE_BENCHMARK=ON /path/to/SOFIE + +# With ONNX Runtime GPU comparison +cmake -B build \ + -DSOFIE_BENCHMARK=ON \ + -DSOFIE_BENCHMARK_ORT=ON \ + -DONNXRUNTIME_ROOT=/path/to/onnxruntime \ + /path/to/SOFIE +``` + +| CMake flag | Default | Description | +|---|---|---| +| `-DSOFIE_BENCHMARK=ON` | — | Enable the benchmark suite | +| `-DSOFIE_BENCHMARK_ORT=ON` | `OFF` | Also benchmark ONNX Runtime GPU | +| `-DONNXRUNTIME_ROOT=` | — | Hint for finding ORT headers/library | + +> **Tested with ONNX Runtime 1.22.0 GPU** +> (`onnxruntime-linux-x64-gpu-1.22.0`). The CMake config bundled with some ORT +> installations may reference an incorrect `lib64/` path — this toolkit uses manual +> header/library detection to avoid that. + +### 3. Build + +```bash +cmake --build build --target sofie_benchmark -j$(nproc) +``` + +This automatically: +1. Builds **`sofie_benchmark_emitter`** — parses each `.onnx` and emits: + - `_GPU_ALPAKA.hxx` — SOFIE CUDA/Alpaka inference code + - `_GPU_ALPAKA.dat` — serialized weights + - `_bench.hxx` — timing wrapper `Benchmark_()` +2. 
Builds **`sofie_benchmark`** — compiles all generated code as `.cu` and links the + timing loop. + +### 4. Run + +```bash +cd build/benchmark + +# SOFIE only (no ORT needed at runtime) +./sofie_benchmark + +# SOFIE + ONNX Runtime GPU comparison +LD_LIBRARY_PATH=/path/to/onnxruntime/lib:$LD_LIBRARY_PATH \ +./sofie_benchmark --onnxruntime +``` + +--- + +## Runtime Options + +| Flag | Default | Description | +|------|---------|-------------| +| `--warmup, -w ` | 10 | Warm-up iterations (not timed) | +| `--iterations, -n ` | 100 | Timed iterations | +| `--weights-dir ` | `.` | Directory containing `.dat` weight files | +| `--onnxruntime, --ort` | off | Run ONNX Runtime GPU benchmark after each SOFIE model | +| `--help, -h` | | Print this help and exit | + + +--- + +## Re-running after adding models + +```bash +cmake build # re-configure (re-globs) +cmake --build build --target sofie_benchmark -j$(nproc) # re-build +``` diff --git a/benchmark/models/.gitkeep b/benchmark/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/benchmark/models/GNN_model.onnx b/benchmark/models/GNN_model.onnx new file mode 100644 index 0000000..833e34d Binary files /dev/null and b/benchmark/models/GNN_model.onnx differ diff --git a/benchmark/models/simple_transformer.onnx b/benchmark/models/simple_transformer.onnx new file mode 100644 index 0000000..1925d9d Binary files /dev/null and b/benchmark/models/simple_transformer.onnx differ diff --git a/benchmark/models/simple_transformer.onnx.data b/benchmark/models/simple_transformer.onnx.data new file mode 100644 index 0000000..3f52857 Binary files /dev/null and b/benchmark/models/simple_transformer.onnx.data differ diff --git a/benchmark/src/BenchmarkBackend.hxx b/benchmark/src/BenchmarkBackend.hxx new file mode 100644 index 0000000..6e7987e --- /dev/null +++ b/benchmark/src/BenchmarkBackend.hxx @@ -0,0 +1,37 @@ +#pragma once +// Backend type aliases — selected at compile time by CMake via -DSOFIE_BACKEND_* +// The runner and 
every generated meta-header use these aliases so they +// stay free of any backend-specific API (cuda_runtime.h, hip_runtime.h, …). + +#include + +namespace sofie_bench { + +using Idx = std::size_t; +using Dim1 = alpaka::DimInt<1>; +using Ext1 = alpaka::Vec; + +#if defined(SOFIE_BACKEND_CUDA) + + using AccTag = alpaka::TagGpuCudaRt; + using Platform = alpaka::PlatformCudaRt; + using Device = alpaka::DevCudaRt; + using Queue = alpaka::Queue; + +#elif defined(SOFIE_BACKEND_HIP) + + using AccTag = alpaka::TagGpuHipRt; + using Platform = alpaka::PlatformHipRt; + using Device = alpaka::DevHipRt; + using Queue = alpaka::Queue; + +#else // CPU serial (default / fallback) + + using AccTag = alpaka::TagCpuSerial; + using Platform = alpaka::PlatformCpu; + using Device = alpaka::DevCpu; + using Queue = alpaka::Queue; + +#endif + +} // namespace sofie_bench diff --git a/benchmark/src/BenchmarkEmitter.cxx.in b/benchmark/src/BenchmarkEmitter.cxx.in new file mode 100644 index 0000000..cc02357 --- /dev/null +++ b/benchmark/src/BenchmarkEmitter.cxx.in @@ -0,0 +1,220 @@ +// SOFIE Benchmark Emitter +// Auto-configured by CMake — do not edit directly. +// For each .onnx model in benchmark/models/ this binary generates: +// _GPU_ALPAKA.hxx — SOFIE inference code +// _GPU_ALPAKA.dat — serialized weights +// _bench.hxx — timing function, following the same +// pattern as the unit tests + +#include "SOFIE/RModel_Base.hxx" +#include "SOFIE/RModel.hxx" +#include "SOFIE/RModelParser_ONNX.hxx" + +#include +#include +#include +#include +#include +#include + +using namespace SOFIE; + +// Resolve a single Dim to a concrete size; dynamic dims default to 1. +static size_t resolveDim(const Dim &d) { + return (d.dim > 0) ? 
static_cast(d.dim) : 1u; +} + +static int EmitBenchmarkModel(const std::string &onnxPath, + const std::string &modelName, + const std::string &outDir) +{ + std::cout << "[Benchmark] Processing: " << onnxPath << "\n"; + + RModelParser_ONNX parser; + RModel model = parser.Parse(onnxPath); + + const auto &inputNames = model.GetInputTensorNames(); + if (inputNames.empty()) { + std::cerr << "[WARN] " << modelName << " has no inputs – skipping.\n"; + return 1; + } + + // Map SOFIE tensor type to C++ type string + auto tensorTypeToCpp = [](ETensorType t) -> std::string { + switch (t) { + case ETensorType::FLOAT: return "float"; + case ETensorType::DOUBLE: return "double"; + case ETensorType::INT32: return "int32_t"; + case ETensorType::INT64: return "int64_t"; + case ETensorType::UINT8: return "uint8_t"; + case ETensorType::INT8: return "int8_t"; + case ETensorType::UINT16: return "uint16_t"; + case ETensorType::INT16: return "int16_t"; + case ETensorType::UINT32: return "uint32_t"; + case ETensorType::UINT64: return "uint64_t"; + case ETensorType::BOOL: return "uint8_t"; + default: return "float"; + } + }; + + // Collect input metadata before code generation + struct InputMeta { + std::string cppType; + size_t numElements; + }; + std::vector inputs; + for (const auto &n : inputNames) { + InputMeta m; + try { m.cppType = tensorTypeToCpp(model.GetTensorType(n)); } + catch (...) { m.cppType = "float"; } + m.numElements = 1; + try { + for (const auto &d : model.GetDimTensorShape(n)) + m.numElements *= resolveDim(d); + } catch (...) 
{} + inputs.push_back(m); + } + + // Generate SOFIE GPU/Alpaka inference code + model.GenerateGPU_ALPAKA(); + + std::string hxxPath = outDir + "/" + modelName + "_GPU_ALPAKA.hxx"; + std::string benchPath = outDir + "/" + modelName + "_bench.hxx"; + + model.OutputGenerated(hxxPath); + + // Sanitize model name into a valid C++ identifier + std::string cppName = modelName; + for (char &c : cppName) + if (!std::isalnum(static_cast(c))) c = '_'; + + // Build "session.infer(input_d_0, input_d_1, ...)" + std::ostringstream inferCall; + inferCall << "session.infer("; + for (size_t i = 0; i < inputs.size(); ++i) { + if (i) inferCall << ", "; + inferCall << "input_d_" << i; + } + inferCall << ")"; + + // ----------------------------------------------------------------------- + // Write _bench.hxx — follows the exact same pattern as the unit tests + // in TestCustomModelsFromONNXForAlpakaCuda.cxx, with timing added. + // ----------------------------------------------------------------------- + std::ofstream bench(benchPath); + if (!bench.is_open()) { + std::cerr << "[ERROR] Cannot open " << benchPath << "\n"; + return 1; + } + + bench + << "// Auto-generated benchmark for model: " << modelName << "\n" + << "// DO NOT EDIT — regenerated by the SOFIE benchmark emitter.\n" + << "#pragma once\n\n" + << "#include \"" << modelName << "_GPU_ALPAKA.hxx\"\n" + << "#include \n" + << "#include \n" + << "#include \n" + << "#include \n" + << "#include \n" + << "#include \n\n" + << "inline void Benchmark_" << cppName + << "(int warmup, int iterations, const std::string& weightsDir) {\n" + << " using Idx = std::size_t;\n" + << " using Dim1 = alpaka::DimInt<1>;\n" + << " using Ext1 = alpaka::Vec;\n\n" + << " // ---- Device/host setup (mirrors unit-test pattern) ----\n" + << " alpaka::PlatformCpu hostPlatform{};\n" + << " auto host = alpaka::getDevByIdx(hostPlatform, 0u);\n" + << " alpaka::PlatformCudaRt platform{};\n" + << " auto device = alpaka::getDevByIdx(platform, 0u);\n" + << " 
alpaka::Queue queue{device};\n\n" + << " std::mt19937 rng(42);\n" + << " std::uniform_real_distribution fdist(-1.0f, 1.0f);\n\n"; + + // Allocate host buffers, fill with data, allocate device buffers, copy + for (size_t i = 0; i < inputs.size(); ++i) { + const std::string &T = inputs[i].cppType; + const size_t N = inputs[i].numElements; + bench + << " // Input " << i << ": " << T << "[" << N << "]\n" + << " auto input_h_" << i << " = alpaka::allocBuf<" << T + << ", Idx>(host, Ext1::all(Idx{" << N << "}));\n" + << " {\n" + << " auto *p = reinterpret_cast<" << T + << "*>(alpaka::getPtrNative(input_h_" << i << "));\n"; + if (T == "float" || T == "double") { + bench + << " for (size_t k = 0; k < " << N + << "; ++k) p[k] = static_cast<" << T << ">(fdist(rng));\n"; + } else { + // Integer/index tensors: fill with zeros so any index value + // is a safe, in-bounds reference into the data arrays. + bench + << " std::fill(p, p + " << N + << ", static_cast<" << T << ">(0));\n"; + } + bench + << " }\n" + << " auto input_d_" << i << " = alpaka::allocBuf<" << T + << ", Idx>(device, Ext1::all(Idx{" << N << "}));\n" + << " alpaka::memcpy(queue, input_d_" << i + << ", input_h_" << i << ");\n\n"; + } + bench << " alpaka::wait(queue);\n\n"; + + // Create session (loads weights from .dat file) + bench + << " // ---- Create session (loads weights) ----\n" + << " std::string weightFile = weightsDir + \"/" + << modelName << "_GPU_ALPAKA.dat\";\n" + << " SOFIE_" << cppName + << "::Session session(weightFile);\n\n"; + + // Warmup (not timed) + bench + << " // ---- Warmup ----\n" + << " for (int w = 0; w < warmup; ++w)\n" + << " " << inferCall.str() << ";\n" + << " alpaka::wait(session.queue);\n" + << " cudaDeviceSynchronize();\n\n"; + + // Timed loop + bench + << " // ---- Timed benchmark ----\n" + << " auto t0 = std::chrono::high_resolution_clock::now();\n" + << " for (int _i = 0; _i < iterations; ++_i)\n" + << " " << inferCall.str() << ";\n" + << " alpaka::wait(session.queue);\n" + << " 
cudaDeviceSynchronize();\n" + << " auto t1 = std::chrono::high_resolution_clock::now();\n\n" + << " double avg_ms = std::chrono::duration" + "(t1 - t0).count() / iterations;\n" + << " double throughput = (avg_ms > 0.0) ? 1000.0 / avg_ms : 0.0;\n" + << " std::printf(\"%-30s avg %8.4f ms (%8.1f inf/s)\\n\",\n" + << " \"" << modelName << "\", avg_ms, throughput);\n" + << "}\n"; + + bench.close(); + + std::cout << "[Benchmark] Wrote: " << hxxPath << "\n" + << " Wrote: " << benchPath << "\n"; + return 0; +} + +// =========================================================================== +// main() — one EmitBenchmarkModel call per model, injected by CMake +// =========================================================================== +int main(int argc, char *argv[]) { + if (argc < 2) { + std::cerr << "Usage: sofie_benchmark_emitter \n"; + return 1; + } + std::string outDir = argv[1]; + int failures = 0; + +@BENCHMARK_EMIT_CAPTURES@ + + std::cout << "[Benchmark Emitter] Done — " << failures << " failure(s).\n"; + return failures == 0 ? 0 : 1; +} diff --git a/benchmark/src/BenchmarkRunner.cxx.in b/benchmark/src/BenchmarkRunner.cxx.in new file mode 100644 index 0000000..a0a4d61 --- /dev/null +++ b/benchmark/src/BenchmarkRunner.cxx.in @@ -0,0 +1,71 @@ +// SOFIE Alpaka Benchmark Runner +// Auto-configured by CMake — do not edit directly. +// Compiled as .cu so NVCC can compile the generated SOFIE CUDA kernels. 
+ +#include +#include +#include +#include +#include + +// Per-model benchmark functions (generated by the emitter, one per .onnx file) +@BENCHMARK_BENCH_HEADERS@ + +// Optional ONNX Runtime GPU comparison +#ifdef SOFIE_BENCHMARK_ORT +#include "ONNXRuntimeBenchmark.hxx" +#endif + +int main(int argc, char *argv[]) { + int warmup = 10; + int iterations = 100; + std::string weightsDir = "."; + bool run_ort = false; // opt-in via --onnxruntime + + for (int i = 1; i < argc; ++i) { + std::string a = argv[i]; + if ((a == "--warmup" || a == "-w") && i + 1 < argc) warmup = std::stoi(argv[++i]); + else if ((a == "--iterations" || a == "-n") && i + 1 < argc) iterations = std::stoi(argv[++i]); + else if (a == "--weights-dir" && i + 1 < argc) weightsDir = argv[++i]; + else if (a == "--onnxruntime" || a == "--ort") run_ort = true; + else if (a == "--help" || a == "-h") { + std::cout << + "Usage: sofie_benchmark [options]\n" + " --warmup, -w Warmup iterations (default: 10)\n" + " --iterations, -n Timed iterations (default: 100)\n" + " --weights-dir SOFIE .dat weight files (default: .)\n" +#ifdef SOFIE_BENCHMARK_ORT + " --onnxruntime, --ort Also run ONNX Runtime GPU comparison\n" +#else + " --onnxruntime, --ort (not available; rebuild with -DSOFIE_BENCHMARK_ORT=ON)\n" +#endif + ; + return 0; + } + } + +#ifndef SOFIE_BENCHMARK_ORT + if (run_ort) { + std::fprintf(stderr, + "Warning: --onnxruntime requested but this binary was built without " + "ORT support.\n Rebuild with -DSOFIE_BENCHMARK_ORT=ON.\n"); + run_ort = false; + } +#endif + + std::printf("=== SOFIE Alpaka Benchmark ===\n"); + std::printf("Warmup: %d | Iterations: %d", warmup, iterations); +#ifdef SOFIE_BENCHMARK_ORT + if (run_ort) std::printf(" | ORT-GPU comparison: ON"); +#endif + std::printf("\n\n"); + + std::printf("%-30s %18s %16s\n", "Model", "Avg (ms)", "Throughput (inf/s)"); + std::printf("%s\n", std::string(70, '-').c_str()); + + // One Benchmark_() call per discovered model, + // immediately followed by the 
optional BenchmarkORT_GPU() call. +@BENCHMARK_RUN_CALLS@ + + return 0; +} diff --git a/benchmark/src/BenchmarkUtils.hxx b/benchmark/src/BenchmarkUtils.hxx new file mode 100644 index 0000000..26a4546 --- /dev/null +++ b/benchmark/src/BenchmarkUtils.hxx @@ -0,0 +1,168 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace sofie_bench { + +struct BenchmarkConfig { + int warmupIter = 10; + int benchIter = 100; + int deviceId = 0; + float tolerance = 1e-3f; + bool validateOrt = false; + std::string weightsDir = "."; + bool csvOutput = false; + bool verbose = false; +}; + +struct BenchmarkResult { + std::string modelName; + size_t inputElements = 0; + size_t outputElements = 0; + float avgInferMs = 0.0f; // per-inference average (chrono) + float throughput = 0.0f; // inferences / second + float weightMemMB = 0.0f; // device memory for model weights (if measurable) + float runtimeMemMB = 0.0f; // device memory for intermediates + bool ortRan = false; + bool ortMatch = false; + float ortMaxDiff = -1.0f; + bool skipped = false; + std::string skipReason; +}; + +inline BenchmarkConfig ParseArgs(int argc, char *argv[]) { + BenchmarkConfig cfg; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if ((arg == "--warmup" || arg == "-w") && i + 1 < argc) + cfg.warmupIter = std::stoi(argv[++i]); + else if ((arg == "--iterations" || arg == "-n") && i + 1 < argc) + cfg.benchIter = std::stoi(argv[++i]); + else if ((arg == "--device" || arg == "-d") && i + 1 < argc) + cfg.deviceId = std::stoi(argv[++i]); + else if ((arg == "--tolerance" || arg == "-t") && i + 1 < argc) + cfg.tolerance = std::stof(argv[++i]); + else if (arg == "--validate-ort") + cfg.validateOrt = true; + else if ((arg == "--weights-dir") && i + 1 < argc) + cfg.weightsDir = argv[++i]; + else if (arg == "--csv") + cfg.csvOutput = true; + else if (arg == "--verbose" || arg == "-v") + cfg.verbose = true; + else if (arg == "--help" || arg == 
"-h") { + std::cout << "SOFIE Alpaka Benchmark\n\n" + << "Options:\n" + << " --warmup, -w Warmup iterations (default: 10)\n" + << " --iterations, -n Benchmark iterations (default: 100)\n" + << " --device, -d Device index (default: 0)\n" + << " --tolerance, -t ONNXRuntime diff tolerance (default: 1e-3)\n" + << " --validate-ort Compare SOFIE outputs to ONNXRuntime\n" + << " --weights-dir Directory containing .dat weight files (default: .)\n" + << " --csv Print results in CSV format\n" + << " --verbose, -v Verbose output\n"; + std::exit(0); + } + } + return cfg; +} + +// Print device name (caller obtains it via alpaka::getName(dev)) +inline void PrintDeviceInfo(const std::string &deviceName) { + std::cout << "Device: " << deviceName << "\n"; +} + +inline void PrintHeader(const BenchmarkConfig &cfg, const std::string &deviceName = "") { + std::cout << "\n=== SOFIE Alpaka Benchmark ===\n"; + if (!deviceName.empty()) + PrintDeviceInfo(deviceName); + std::cout << "Warmup: " << cfg.warmupIter + << " | Iterations: " << cfg.benchIter; + if (cfg.validateOrt) + std::cout << " | ONNXRuntime validation ON (tol=" << cfg.tolerance << ")"; + std::cout << "\n\n"; + + if (cfg.csvOutput) { + std::cout << "Model,InputElems,OutputElems,AvgInferMs,Throughput(inf/s)," + "WeightMem(MB),RuntimeMem(MB),OrtMatch,OrtMaxDiff\n"; + } else { + std::cout << std::left + << std::setw(30) << "Model" + << std::setw(12) << "Input" + << std::setw(12) << "Output" + << std::setw(14) << "Avg(ms)" + << std::setw(16) << "Throughput(i/s)" + << std::setw(12) << "ORT Check" + << "\n"; + std::cout << std::string(96, '-') << "\n"; + } +} + +inline void PrintResult(const BenchmarkResult &r, const BenchmarkConfig &cfg) { + if (r.skipped) { + if (!cfg.csvOutput) + std::cout << std::left << std::setw(30) << r.modelName + << " [SKIPPED: " << r.skipReason << "]\n"; + return; + } + + if (cfg.csvOutput) { + std::cout << r.modelName << "," + << r.inputElements << "," + << r.outputElements << "," + << std::fixed << 
std::setprecision(4) << r.avgInferMs << "," + << std::fixed << std::setprecision(1) << r.throughput << "," + << std::fixed << std::setprecision(2) << r.weightMemMB << "," + << std::fixed << std::setprecision(2) << r.runtimeMemMB << ","; + if (r.ortRan) + std::cout << (r.ortMatch ? "PASS" : "FAIL") << "," << r.ortMaxDiff; + else + std::cout << "N/A,N/A"; + std::cout << "\n"; + } else { + std::string ortStr = "N/A"; + if (r.ortRan) { + std::ostringstream oss; + oss << (r.ortMatch ? "PASS" : "FAIL") + << "(d=" << std::scientific << std::setprecision(1) << r.ortMaxDiff << ")"; + ortStr = oss.str(); + } + std::cout << std::left + << std::setw(30) << r.modelName + << std::setw(12) << r.inputElements + << std::setw(12) << r.outputElements + << std::setw(14) << std::fixed << std::setprecision(4) << r.avgInferMs + << std::setw(16) << std::fixed << std::setprecision(1) << r.throughput + << std::setw(12) << ortStr + << "\n"; + } +} + +inline void PrintSummary(const std::vector &results, const BenchmarkConfig &cfg) { + if (cfg.csvOutput) return; + + std::cout << "\n" << std::string(96, '=') << "\n"; + int ran = 0, skipped = 0, ortFail = 0; + float totalMs = 0.0f; + for (const auto &r : results) { + if (r.skipped) { ++skipped; continue; } + ++ran; + totalMs += r.avgInferMs; + if (r.ortRan && !r.ortMatch) ++ortFail; + } + std::cout << "Summary: " << ran << " model(s) benchmarked"; + if (skipped) std::cout << ", " << skipped << " skipped"; + if (ran > 0) std::cout << ", avg inference " << std::fixed << std::setprecision(4) << (totalMs / ran) << " ms"; + if (ortFail) std::cout << ", " << ortFail << " ORT mismatch(es)"; + std::cout << "\n"; +} + +} // namespace sofie_bench diff --git a/benchmark/src/ONNXRuntimeBenchmark.hxx b/benchmark/src/ONNXRuntimeBenchmark.hxx new file mode 100644 index 0000000..4831c02 --- /dev/null +++ b/benchmark/src/ONNXRuntimeBenchmark.hxx @@ -0,0 +1,231 @@ +// SOFIE Benchmark — ONNX Runtime GPU backend +// Generic benchmark: loads any ONNX model, 
introspects shapes, runs with the +// CUDA ExecutionProvider. Float inputs are filled with uniform random values; +// integer inputs are zeroed (safe for index tensors like edge_index). +// +// Data stays on the HOST side of the ORT API (ORT handles H↔D transfers +// internally) — this measures end-to-end latency from the application's +// perspective. Use the optional IOBinding path (--ort-device-io, WIP) to +// measure pure GPU compute time comparable to the SOFIE numbers. +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// ── helpers ────────────────────────────────────────────────────────────────── + +namespace sofie_ort_bench_detail { + +/// Total element count from a shape vector (-1 dynamic dims are treated as 1). +inline std::size_t shapeToSize(const std::vector& shape) { + std::size_t n = 1; + for (auto d : shape) n *= (d > 0 ? static_cast(d) : 1u); + return n; +} + +/// Human-readable ORT element-type name. +inline const char* ortTypeName(ONNXTensorElementDataType t) { + switch (t) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: return "float32"; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: return "float64"; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: return "int32"; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: return "int64"; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: return "uint8"; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: return "bool"; + default: return "other"; + } +} + +} // namespace sofie_ort_bench_detail + +// ── main benchmark function ─────────────────────────────────────────────────── + +/// Run @p model_path through ONNX Runtime's CUDAExecutionProvider. +/// Results are printed in the same table format as the SOFIE Alpaka benchmark. +/// +/// @param model_path Full path to the .onnx file. +/// @param model_name Display name shown in the table (typically the stem). +/// @param warmup Number of warm-up iterations (not timed). +/// @param iterations Number of timed iterations. 
+/// @param device_id CUDA device index (default 0). +/// @param verbose If true, print per-input shape/type information. +inline void BenchmarkORT_GPU(const std::string& model_path, + const std::string& model_name, + int warmup, + int iterations, + int device_id = 0, + bool verbose = false) +{ + using namespace sofie_ort_bench_detail; + + // ── ORT session setup ──────────────────────────────────────────────────── + Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "sofie_ort_bench"); + + Ort::SessionOptions opts; + opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); + opts.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + + OrtCUDAProviderOptions cuda_opts{}; + cuda_opts.device_id = device_id; + cuda_opts.arena_extend_strategy = 0; // kNextPowerOfTwo + cuda_opts.gpu_mem_limit = SIZE_MAX; + cuda_opts.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchExhaustive; + cuda_opts.do_copy_in_default_stream = 1; + opts.AppendExecutionProvider_CUDA(cuda_opts); + + Ort::Session session(env, model_path.c_str(), opts); + Ort::AllocatorWithDefaultOptions alloc; + Ort::MemoryInfo mem_cpu = + Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + + // ── introspect inputs ───────────────────────────────────────────────────── + const std::size_t num_inputs = session.GetInputCount(); + + std::vector input_names_str(num_inputs); + std::vector input_names_ptr(num_inputs); + std::vector> input_shapes(num_inputs); + std::vector input_types(num_inputs); + + // backing storage — one allocation per input + std::vector> float_data(num_inputs); + std::vector> double_data(num_inputs); + std::vector> int64_data(num_inputs); + std::vector> int32_data(num_inputs); + std::vector> uint8_data(num_inputs); + // Note: bool_data uses uint8_t storage; pointer is cast to bool* for CreateTensor + // (sizeof(bool)==sizeof(uint8_t)==1 on all supported platforms) + + std::mt19937 rng(42); + std::uniform_real_distribution fdist(-1.f, 1.f); + + std::vector input_tensors; + 
input_tensors.reserve(num_inputs); + + for (std::size_t i = 0; i < num_inputs; ++i) { + // name + auto name_ptr = session.GetInputNameAllocated(i, alloc); + input_names_str[i] = name_ptr.get(); + input_names_ptr[i] = input_names_str[i].c_str(); + + // type + shape + auto info = session.GetInputTypeInfo(i); + auto tinfo = info.GetTensorTypeAndShapeInfo(); + input_types[i] = tinfo.GetElementType(); + input_shapes[i] = tinfo.GetShape(); + + // replace dynamic dims (-1) with 1 for benchmarking + for (auto& d : input_shapes[i]) if (d < 0) d = 1; + + std::size_t n = shapeToSize(input_shapes[i]); + + if (verbose) { + std::printf(" Input %-2zu %-20s type=%-8s numel=%zu\n", + i, input_names_str[i].c_str(), + ortTypeName(input_types[i]), n); + } + + // fill data and create OrtValue + switch (input_types[i]) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: { + float_data[i].resize(n); + for (auto& v : float_data[i]) v = fdist(rng); + input_tensors.push_back(Ort::Value::CreateTensor( + mem_cpu, float_data[i].data(), n, + input_shapes[i].data(), input_shapes[i].size())); + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: { + double_data[i].resize(n, 0.0); + for (auto& v : double_data[i]) + v = static_cast(fdist(rng)); + input_tensors.push_back(Ort::Value::CreateTensor( + mem_cpu, double_data[i].data(), n, + input_shapes[i].data(), input_shapes[i].size())); + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { + // Zero: safe for index tensors (edge_index, etc.) 
+ int64_data[i].assign(n, 0); + input_tensors.push_back(Ort::Value::CreateTensor( + mem_cpu, int64_data[i].data(), n, + input_shapes[i].data(), input_shapes[i].size())); + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { + int32_data[i].assign(n, 0); + input_tensors.push_back(Ort::Value::CreateTensor( + mem_cpu, int32_data[i].data(), n, + input_shapes[i].data(), input_shapes[i].size())); + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: { + uint8_data[i].assign(n, 0); + input_tensors.push_back(Ort::Value::CreateTensor( + mem_cpu, uint8_data[i].data(), n, + input_shapes[i].data(), input_shapes[i].size())); + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: { + // ORT requires bool* — use uint8_t backing (1 byte each, same size) + uint8_data[i].assign(n, 0); + input_tensors.push_back(Ort::Value::CreateTensor( + mem_cpu, + reinterpret_cast(uint8_data[i].data()), n, + input_shapes[i].data(), input_shapes[i].size())); + break; + } + default: + throw std::runtime_error( + std::string("BenchmarkORT_GPU: unsupported input type for ") + + input_names_str[i]); + } + } + + // ── output names ───────────────────────────────────────────────────────── + const std::size_t num_outputs = session.GetOutputCount(); + std::vector output_names_str(num_outputs); + std::vector output_names_ptr(num_outputs); + for (std::size_t i = 0; i < num_outputs; ++i) { + auto ptr = session.GetOutputNameAllocated(i, alloc); + output_names_str[i] = ptr.get(); + output_names_ptr[i] = output_names_str[i].c_str(); + } + + // build run-options that disable CPU fallback for a pure GPU measurement + Ort::RunOptions run_opts; + + // ── warm-up ────────────────────────────────────────────────────────────── + for (int w = 0; w < warmup; ++w) { + session.Run(run_opts, + input_names_ptr.data(), input_tensors.data(), num_inputs, + output_names_ptr.data(), num_outputs); + } + cudaDeviceSynchronize(); + + // ── timed run ───────────────────────────────────────────────────────────── + auto t0 = 
std::chrono::high_resolution_clock::now(); + for (int it = 0; it < iterations; ++it) { + session.Run(run_opts, + input_names_ptr.data(), input_tensors.data(), num_inputs, + output_names_ptr.data(), num_outputs); + } + cudaDeviceSynchronize(); + auto t1 = std::chrono::high_resolution_clock::now(); + + double avg_ms = std::chrono::duration(t1 - t0).count() + / iterations; + double throughput = (avg_ms > 0.0) ? 1000.0 / avg_ms : 0.0; + + // Print in the same table format, with "[ORT]" tag in the model column + std::string label = std::string(model_name) + " [ORT-GPU]"; + std::printf("%-30s avg %8.4f ms (%8.1f inf/s)\n", + label.c_str(), avg_ms, throughput); +} diff --git a/check_style.sh b/check_style.sh new file mode 100644 index 0000000..22a56e4 --- /dev/null +++ b/check_style.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -e + +# Directories +SRC_DIR="./include" +TEST_DIR="./tests" + +echo "📝 Discovering source/header files..." + +FILES=$(find "$SRC_DIR" "$TEST_DIR" \ + -path "$TEST_DIR/build" -prune -o \ + -type f \( \ + -name '*.cpp' -o -name '*.cc' -o -name '*.cxx' -o \ + -name '*.h' -o -name '*.hpp' -o -name '*.hxx' -o -name '*.hh' \ + \) -print) + +if [ -z "$FILES" ]; then + echo "⚠️ No files found to process." + exit 0 +fi + +echo "🎯 Files to check:" +echo "$FILES" + +echo "🎨 Running clang-format..." +for file in $FILES; do + echo "Formatting $file" + clang-format -i "$file" +done + +echo "🔍 Running clang-tidy..." +for file in $FILES; do + echo "Linting $file" + clang-tidy "$file" --extra-arg=-std=c++20 -- -I"$SRC_DIR" || true +done + +echo "✅ Formatting and linting complete." 
diff --git a/cmake/SOFIEConfig.cmake.in b/cmake/SOFIEConfig.cmake.in new file mode 100644 index 0000000..94ebc4a --- /dev/null +++ b/cmake/SOFIEConfig.cmake.in @@ -0,0 +1,13 @@ +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) + +find_dependency(Protobuf) + +if(@SOFIE_WITH_ROOT@) + find_dependency(ROOT COMPONENTS Core TMVA Tree) +endif() + +include("${CMAKE_CURRENT_LIST_DIR}/SOFIETargets.cmake") + +check_required_components(SOFIE) diff --git a/cmake/modules/SofieTestMacros.cmake b/cmake/modules/SofieTestMacros.cmake new file mode 100644 index 0000000..1f4d235 --- /dev/null +++ b/cmake/modules/SofieTestMacros.cmake @@ -0,0 +1,73 @@ +# Fallback test macros used when ROOT is not available. +# These provide the same interface as ROOTTEST_GENERATE_EXECUTABLE and +# ROOTTEST_ADD_TEST from RoottestMacros.cmake but without requiring ROOT. + +macro(ROOTTEST_GENERATE_EXECUTABLE executable) + cmake_parse_arguments(ARG "" "RESOURCE_LOCK" + "LIBRARIES;COMPILE_FLAGS;DEPENDS;FIXTURES_SETUP;FIXTURES_CLEANUP;FIXTURES_REQUIRED" + ${ARGN}) + + add_executable(${executable} EXCLUDE_FROM_ALL ${ARG_UNPARSED_ARGUMENTS}) + set_target_properties(${executable} PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + + if(ARG_DEPENDS) + add_dependencies(${executable} ${ARG_DEPENDS}) + endif() + + if(ARG_LIBRARIES) + target_link_libraries(${executable} ${ARG_LIBRARIES}) + endif() + + if(ARG_COMPILE_FLAGS) + set_target_properties(${executable} PROPERTIES COMPILE_FLAGS ${ARG_COMPILE_FLAGS}) + endif() + + set(_sofie_build_test ${executable}-build) + add_test(NAME ${_sofie_build_test} + COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target ${executable}) + + if(ARG_FIXTURES_SETUP) + set_property(TEST ${_sofie_build_test} PROPERTY FIXTURES_SETUP ${ARG_FIXTURES_SETUP}) + endif() + if(ARG_FIXTURES_CLEANUP) + set_property(TEST ${_sofie_build_test} PROPERTY FIXTURES_CLEANUP ${ARG_FIXTURES_CLEANUP}) + endif() + if(ARG_FIXTURES_REQUIRED) + set_property(TEST ${_sofie_build_test} 
PROPERTY FIXTURES_REQUIRED ${ARG_FIXTURES_REQUIRED}) + endif() +endmacro() + +function(ROOTTEST_ADD_TEST testname) + cmake_parse_arguments(ARG "" + "WORKING_DIR;TIMEOUT;RESOURCE_LOCK" + "EXEC;COMMAND;DEPENDS;FIXTURES_SETUP;FIXTURES_CLEANUP;FIXTURES_REQUIRED;ENVIRONMENT;PROPERTIES" + ${ARGN}) + + if(ARG_EXEC) + set(_cmd ${ARG_EXEC}) + elseif(ARG_COMMAND) + set(_cmd ${ARG_COMMAND}) + else() + message(FATAL_ERROR "ROOTTEST_ADD_TEST: must specify EXEC or COMMAND") + endif() + + add_test(NAME ${testname} COMMAND ${_cmd} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + + if(ARG_FIXTURES_SETUP) + set_property(TEST ${testname} PROPERTY FIXTURES_SETUP ${ARG_FIXTURES_SETUP}) + endif() + if(ARG_FIXTURES_CLEANUP) + set_property(TEST ${testname} PROPERTY FIXTURES_CLEANUP ${ARG_FIXTURES_CLEANUP}) + endif() + if(ARG_FIXTURES_REQUIRED) + set_property(TEST ${testname} PROPERTY FIXTURES_REQUIRED ${ARG_FIXTURES_REQUIRED}) + endif() + if(ARG_ENVIRONMENT) + set_property(TEST ${testname} PROPERTY ENVIRONMENT ${ARG_ENVIRONMENT}) + endif() + if(ARG_TIMEOUT) + set_property(TEST ${testname} PROPERTY TIMEOUT ${ARG_TIMEOUT}) + endif() +endfunction() diff --git a/src/SOFIE_core/CMakeLists.txt b/core/CMakeLists.txt similarity index 74% rename from src/SOFIE_core/CMakeLists.txt rename to core/CMakeLists.txt index 7297957..36cf037 100644 --- a/src/SOFIE_core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -76,6 +76,7 @@ list(TRANSFORM sources_headers PREPEND "inc/") set(sources_cxx src/RModel_Base.cxx src/RModel.cxx + src/RModel_ALPAKA.cxx src/RModel_GNN.cxx src/RModel_GraphIndependent.cxx src/RFunction.cxx @@ -86,23 +87,35 @@ set(sources_cxx ) target_sources(SOFIE_core PRIVATE ${sources_headers} ${sources_cxx}) -target_include_directories(SOFIE_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/inc) -target_link_libraries(SOFIE_core PUBLIC - Tree - Core - RIO +target_include_directories(SOFIE_core PUBLIC + $ + $ ) +target_link_libraries(SOFIE_core PUBLIC utils) -ROOT_GENERATE_DICTIONARY(G__SOFIE 
${sources_headers} - LINKDEF inc/LinkDef.h - MODULE SOFIE_core - OPTIONS --deep -) +if(SOFIE_WITH_ROOT AND ROOT_FOUND) + target_compile_definitions(SOFIE_core PUBLIC SOFIE_SUPPORT_ROOT_BINARY) + target_link_libraries(SOFIE_core PUBLIC Tree Core RIO) + + ROOT_GENERATE_DICTIONARY(G__SOFIE_core ${sources_headers} + LINKDEF inc/LinkDef.h + MODULE SOFIE_core + OPTIONS --deep + ) + + # Install the dictionaries. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core_rdict.pcm + ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core.rootmap + DESTINATION lib) +endif() install(TARGETS SOFIE_core - LIBRARY DESTINATION lib + EXPORT SOFIETargets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} +) +install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/inc/" + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) -install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/inc/" DESTINATION "include") if(testing) add_subdirectory(test) diff --git a/src/SOFIE_core/README.md b/core/README.md similarity index 96% rename from src/SOFIE_core/README.md rename to core/README.md index 033cad4..b0a50a1 100644 --- a/src/SOFIE_core/README.md +++ b/core/README.md @@ -12,10 +12,10 @@ This is a new development in TMVA and is currently in early experimental stage. ## Installation -Build ROOT with the cmake option tmva-sofie enabled. +Build ROOT with the cmake option sofie enabled. ```bash -cmake ../root -Dtmva-sofie=ON +cmake ../root -Dsofie=ON make -j8 ``` @@ -25,7 +25,6 @@ SOFIE works in a parser-generator working architecture. With SOFIE, the user get From ROOT command line, or in a ROOT macro, we can proceed with an ONNX model: ```c++ -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; SOFIE::RModel model = parser.Parse(“./example_model.onnx”); model.Generate(); @@ -73,7 +72,6 @@ SOFIE also supports generating inference code with RDataFrame as inputs, refer t Here is the updated list of supported ONNX operators. 
You can obtain this list by doing ```cpp -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; std::vector supportedOperators = parser.GetRegisteredOperators(); ``` @@ -164,7 +162,6 @@ The above operators are supported for tensors of the following types: You can also check your model whether all operators are implemented by doing the following: ```c++ -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; parser.CheckModel("example_model.ONNX"); ``` diff --git a/src/SOFIE_core/inc/LinkDef.h b/core/inc/LinkDef.h similarity index 100% rename from src/SOFIE_core/inc/LinkDef.h rename to core/inc/LinkDef.h diff --git a/src/SOFIE_core/inc/SOFIE/FunctionList.hxx b/core/inc/SOFIE/FunctionList.hxx similarity index 100% rename from src/SOFIE_core/inc/SOFIE/FunctionList.hxx rename to core/inc/SOFIE/FunctionList.hxx diff --git a/src/SOFIE_core/inc/SOFIE/OperatorList.hxx b/core/inc/SOFIE/OperatorList.hxx similarity index 100% rename from src/SOFIE_core/inc/SOFIE/OperatorList.hxx rename to core/inc/SOFIE/OperatorList.hxx diff --git a/src/SOFIE_core/inc/SOFIE/RFunction.hxx b/core/inc/SOFIE/RFunction.hxx similarity index 98% rename from src/SOFIE_core/inc/SOFIE/RFunction.hxx rename to core/inc/SOFIE/RFunction.hxx index 53c30e3..f79691a 100644 --- a/src/SOFIE_core/inc/SOFIE/RFunction.hxx +++ b/core/inc/SOFIE/RFunction.hxx @@ -3,6 +3,7 @@ #include "SOFIE/RModel_Base.hxx" #include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" #include #include diff --git a/src/SOFIE_core/inc/SOFIE/RFunction_MLP.hxx b/core/inc/SOFIE/RFunction_MLP.hxx similarity index 90% rename from src/SOFIE_core/inc/SOFIE/RFunction_MLP.hxx rename to core/inc/SOFIE/RFunction_MLP.hxx index 8dfc0e1..d9f8626 100644 --- a/src/SOFIE_core/inc/SOFIE/RFunction_MLP.hxx +++ b/core/inc/SOFIE/RFunction_MLP.hxx @@ -15,7 +15,7 @@ enum class Activation { class RFunction_MLP: public RFunction_Update { private: - Int_t fNumLayers; // Number of Layers in MLP + int_t fNumLayers; // Number of 
Layers in MLP Activation fActivationFunction; bool fActivateFinal; // if True, fActivationFunction is applied as the activation for the last layer std::vector fKernelTensors; @@ -23,7 +23,7 @@ private: public: virtual ~RFunction_MLP() {} - RFunction_MLP(FunctionTarget target, Int_t numLayers, Activation activation_function=Activation::RELU, bool activate_final=false, GraphType gType=GraphType::GNN); + RFunction_MLP(FunctionTarget target, int_t numLayers, Activation activation_function=Activation::RELU, bool activate_final=false, GraphType gType=GraphType::GNN); void Initialize(); diff --git a/src/SOFIE_core/inc/SOFIE/RFunction_Mean.hxx b/core/inc/SOFIE/RFunction_Mean.hxx similarity index 100% rename from src/SOFIE_core/inc/SOFIE/RFunction_Mean.hxx rename to core/inc/SOFIE/RFunction_Mean.hxx diff --git a/src/SOFIE_core/inc/SOFIE/RFunction_Sum.hxx b/core/inc/SOFIE/RFunction_Sum.hxx similarity index 100% rename from src/SOFIE_core/inc/SOFIE/RFunction_Sum.hxx rename to core/inc/SOFIE/RFunction_Sum.hxx diff --git a/src/SOFIE_core/inc/SOFIE/RModel.hxx b/core/inc/SOFIE/RModel.hxx similarity index 61% rename from src/SOFIE_core/inc/SOFIE/RModel.hxx rename to core/inc/SOFIE/RModel.hxx index 79541af..103c3c5 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel.hxx +++ b/core/inc/SOFIE/RModel.hxx @@ -13,17 +13,26 @@ class RModel final : public RModel_Base { private: bool fIsInitialized = false; bool fIsSubGraph = false; + bool fUseVDT = false; int fVerbose = 0; int fBatchSize = -1; long fReadPos = 0; // reading file position + size_t fConstantTensorSize = 0; // size (in Bytes) of the allocated constant tensors + size_t fWeightsTensorSize = 0; // size (in Bytes) of the allocated weight tensors + size_t fOtherTensorSize = 0; // size (in Bytes) of intermediate tensors which are not managed by the memory pool + + OptimizationLevel fOptimizationLevel = OptimizationLevel::kExtended; std::unordered_map fInputTensorInfos; // input tensors where shape may not fully defined or other graph 
inputs? std::unordered_map fReadyInputTensorInfos; // input tensors where shape is full defined std::unordered_map fInitializedTensors; std::unordered_map fIntermediateTensorInfos; std::unordered_map fDynamicTensorInfos; + std::unordered_map, bool>> fShapeTensors; // constant tensors describing a shape + std::unordered_map fAliasTensors; // alias tensors (name -> original tensor name) std::unordered_map fShapeParams; // parameters defining the dynamic shape (e.g. batch size), store also its default value + std::vector fDimShapeNames; // parameter names used to define the shapes std::vector fOutputTensorNames; std::vector fInputTensorNames; // input tensor names using ONNX order @@ -38,6 +47,26 @@ private: MemoryPoolInfo fIntermediateMemoryInfo; /// fIntermediateTensorFrequencyLookup; /// opIndices; ///< consecutive op indices forming this group + std::string inputTensor; ///< input tensor name of the first op + std::string outputTensor; ///< output tensor name of the last op + size_t numElements = 0; + bool isFused() const { return opIndices.size() > 1; } + std::string suffix() const { + std::string s; + for (auto i : opIndices) s += "_" + std::to_string(i); + return s; + } + }; + std::vector fEltwiseFusionGroups; /// fOpToFusionGroupIdx; /// fusion group index + std::set fFusionIntermediateTensors; /// &GetTensorShape(std::string name) const; - std::vector GetDynamicTensorShape(std::string name) const; - const ETensorType &GetTensorType(std::string name) const; + std::vector GetTensorShape(const std::string & name) const; + std::vector GetDimTensorShape(const std::string & name) const; + ETensorType GetTensorType(std::string name) const; + std::vector GetDynamicTensorShape(const std::string & name) const ; + + // get the values for the tensor representing a shape + const std::vector & GetShapeTensorValues(const std::string & tensor_name) const; + bool CheckIfTensorAlreadyExist(std::string tensor_name); void AddInputTensorInfo(std::string input_name, ETensorType 
type, std::vector shape); @@ -81,6 +115,7 @@ public: size_t length = ConvertShapeToLength(shape); std::shared_ptr data_ptr(malloc(length * sizeof(T)), free); std::memcpy(data_ptr.get(), (void*) data, length * sizeof(T)); + std::cout<<"Length of constant tensor "<(T()), shape, data_ptr); } // for boolean can be more convenient passing an std::vector @@ -102,6 +137,12 @@ public: AddInitializedTensor(tensor_name, GetTemplatedType(T()), shape, data); } + void AddShapeTensor(const std::string & name, const std::vector & shapeValues, bool scalar = false); + void AddAliasTensor(const std::string & name, const std::string & origin); + bool IsAliasTensor(const std::string & tensor_name) const; + + void AddExtraCodeForDimShapes(const std::string & code) { fExtraCodeForDimShapes += code; } + // add and initialize subgraph to the model void InitializeSubGraph(std::shared_ptr graph); @@ -118,13 +159,15 @@ public: bool IsDimInputTensor(const std::string &name) const; // check if tensor is a fully specified input tensor bool IsReadyInputTensor(const std::string &name) const; + /// check if a tensor is a shape tensor + bool IsShapeTensor(const std::string & name) const; // Add intermediate tensor void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector dim_shape); void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector shape); // Add an intermediate dynamic tensor void AddDynamicTensor(std::string tensor_name, ETensorType type, std::vector shape); - + void AddShapeParam(const std::string & name, size_t def_value = 0); void AddInputTensorName(std::string name); void AddOutputTensorNameList(std::vector output_tensor_names); void @@ -132,6 +175,9 @@ public: void UpdateInitializedTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data); std::shared_ptr GetInitializedTensorData(std::string tensor_name); + void RemoveInitializedTensor(std::string tensor_name); + template + std::vector 
GetTensorData(const std::string & name); void Initialize(int batchSize = -1, bool verbose = false); void Initialize(const std::map & inputParams, bool verbose = false); @@ -141,40 +187,77 @@ public: { Generate(static_cast>(options), batchSize, pos, verbose); } + void GenerateGPU_ALPAKA(std::underlying_type_t options, int batchSize = -1, bool verbose = false); + void GenerateGPU_ALPAKA(Options options = Options::kDefault, int batchSize = -1, bool verbose = false) + { + GenerateGPU_ALPAKA(static_cast>(options), batchSize, verbose); + } // generate the infer function signature. If isdecl= false generate the calling infer function // used to infer the sub-graphs std::string GenerateInferSignature(bool isdecl = true); + // generate the infer function signature for inference on ALPAKA. If isdecl= false generate the calling infer function + // used to infer the sub-graphs + std::string GenerateInferSignature_GPU_ALPAKA(bool isdecl = true); + + // generate the _infer_impl signature using ViewPlainPtr types instead of Buf types + std::string GenerateImplSignature_GPU_ALPAKA(bool isdecl = true); + + void RemoveIntermediateTensor(const std::string& tensor_name){ + fIntermediateTensorInfos.erase(tensor_name); + } + // calculate total intermediate memory and position intermediate tensor addresses - std::string AllocateIntermediateMemory(std::span op_output_tensors); - void CheckAndFlushIntermediateMemory(std::span op_output_tensors, const size_t& op_idx); + std::string AllocateIntermediateMemory(std::span op_output_tensors); + void CheckAndFlushIntermediateMemory(std::span op_output_tensors, const size_t& op_idx); protected: // internal functions // generate code for the initialized tensors void GenerateInitializedTensorInfo(); + + void GenerateInitializedTensorInfo_GPU_ALPAKA(); // generate code for the intermediate tensors void GenerateIntermediateTensorInfo(); + + // generate code for the temporary initialized tensors containers + void 
GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA(); + // generate code for the dynamic tensors void GenerateDynamicTensorInfo(); + + void GenerateDynamicTensorInfo_GPU_ALPAKA(); // generate code for declarations needed by operators void GenerateOperatorDeclarations(); // generate code for inference void GenerateOutput(); + + void GenerateOutput_GPU_ALPAKA(); + + void MoveInitializedTensorsToBuffers_ALPAKA(); // generate code for initializing memory pool for intermediate tensors void GenerateIntermediateMemoryPool(); // Generate all session code void GenerateSessionCode(); + void GenerateSessionCode_GPU_ALPAKA(); + void GenerateGPU_ALPAKA_Buffers(); + + void CheckAndFuseOperators(); + bool IsInputTensorShapeParam(std::string const ¶mName) const; + std::vector CollectTensorMemberNames(const std::string &input); + void GenerateRequiredInputTensorInfo(); public: const std::vector &GetInputTensorNames() const { return fInputTensorNames; } const std::vector &GetOutputTensorNames() const { return fOutputTensorNames; } + const std::vector & GetDimShapeNames() const { return fDimShapeNames; } void ReadInitializedTensorsFromFile(long); long WriteInitializedTensorsToFile(std::string filename = ""); - void PrintIntermediateTensors(); - void PrintOutputTensors(); + void PrintIntermediateTensors() const; + void PrintOutputTensors() const; + void PrintSummary() const; void OutputGenerated(std::string filename = "", bool append = false); std::vector GetOutputTensorNames() { return fOutputTensorNames; } void SetFilename(std::string filename) { fName = filename; } @@ -185,24 +268,46 @@ public: //a view only T obj; if (fInitializedTensors.find(tensor_name) != fInitializedTensors.end()){ - throw std::runtime_error("TMVA-SOFIE: initialized tensor with name " + tensor_name + " already exists \n"); + throw std::runtime_error("sofie: initialized tensor with name " + tensor_name + " already exists \n"); } InitializedTensor new_tensor_ {GetTemplatedType(obj), new_tensor.GetShape() , 
static_cast(new_tensor.GetData())}; fInitializedTensors[tensor_name] = new_tensor_; } */ - void PrintRequiredInputTensors(); - void PrintInitializedTensors(); - void PrintDynamicTensors(); + void PrintRequiredInputTensors() const; + void PrintInitializedTensors() const; + void PrintDynamicTensors() const; void HeadInitializedTensors(std::string name, int n_print = 50); bool UseSession() const { return fUseSession; } - + void SetUseVDT(bool on) { + fUseVDT = on; + } + bool UseVDT() const { return fUseVDT;} + +#ifdef SOFIE_SUPPORT_ROOT_BINARY // Use the ClassDef macro to allow definition of custom streaming ClassDefNV(RModel, 3); +#endif + }; +template +inline std::vector RModel::GetTensorData(const std::string & name) { + if (!IsInitializedTensor(name)) return std::vector{}; + T * data = static_cast(GetInitializedTensorData(name).get()); + size_t size = ConvertShapeToLength(GetTensorShape(name)); + return std::vector(data, data+size); +} + +template<> +inline std::vector RModel::GetTensorData(const std::string & name) { + if (!IsShapeTensor(name)) return std::vector{}; + return GetShapeTensorValues(name); +} + + } // namespace SOFIE #endif // SOFIE_RMODEL diff --git a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx b/core/inc/SOFIE/RModel_Base.hxx similarity index 54% rename from src/SOFIE_core/inc/SOFIE/RModel_Base.hxx rename to core/inc/SOFIE/RModel_Base.hxx index f8a9d34..d4f9afa 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx +++ b/core/inc/SOFIE/RModel_Base.hxx @@ -12,8 +12,10 @@ #include #include #include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" + +#ifdef SOFIE_SUPPORT_ROOT_BINARY #include "TBuffer.h" +#endif namespace SOFIE { @@ -27,10 +29,26 @@ enum class Options { kGNNComponent = 0x10, }; +// Optimization levels inspired by ONNXRuntime. +// We only get Operator Fusion with the Basic, and +// memory reuse with Extended. 
kExtended is enabled +// by default +enum class OptimizationLevel { + kBasic = 0x0, + kExtended = 0x1, +}; + enum class WeightFileType { None, RootBinary, Text }; -std::underlying_type_t operator|(Options opA, Options opB); -std::underlying_type_t operator|(std::underlying_type_t opA, Options opB); + +inline std::underlying_type_t operator|(Options opA, Options opB) { + return static_cast>(opA) | + static_cast>(opB); +} + +inline std::underlying_type_t operator|(std::underlying_type_t opA, Options opB) { + return opA | static_cast>(opB); +} class RModel_Base { @@ -53,6 +71,45 @@ protected: bool fIsGNN = false; bool fIsGNNComponent = false; + // Function to generate the code for declaring and initializing constant tensors + // This is for tensors which are not part of weight files and can be created from the Constant operator + template + std::string GenerateConstantTensorCode(const std::pair &t) + { + std::stringstream strs; + std::string type = ConvertTypeToString(t.second.type()); + size_t length = ConvertShapeToLength(t.second.shape()); + std::cout<<"Constant tensor name: "< 100) ? 
false : true; + + const T *data = t.second.data(); + + // and check if all values are the same + bool sameData = false; + // for non stack allocation check if data are the same + if (!allocateOnStack && length > 1) { + size_t idx = 1; + std::cout<<"insider allocate on stack and length\n"; + do { + sameData = (data[idx] == data[idx - 1]); + idx++; + } while (sameData && idx < length); + } + if (allocateOnStack) { + strs << type << " tensor_" << t.first << "[" << length << "] = " << ConvertValuesToString(length, data) << ";\n"; + } else { + strs << "std::vector<" << type << "> fTensor_" << t.first << " = "; + if (sameData) + strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; + else { + strs << ConvertValuesToString(length, data) << ";\n"; + } + strs << "const " << type << " * tensor_" + t.first + " = fTensor_" + t.first + ".data();\n"; + } + return strs.str(); + } + public: /** Default constructor. Needed to allow serialization of ROOT objects. 
See @@ -73,15 +130,15 @@ public: } void AddNeededStdLib(std::string libname) { - if (fAllowedStdLib.find(libname) != fAllowedStdLib.end()) { - fNeededStdLib.insert(libname); - } + // if the library is already in the set, insert does nothing + fNeededStdLib.insert(libname); } void AddNeededCustomHeader(std::string filename) { fCustomOpHeaders.insert(filename); } void GenerateHeaderInfo(std::string &hgname); + void GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname); void PrintGenerated() { std::cout << fGC; } std::string ReturnGenerated() { return fGC; } diff --git a/src/SOFIE_core/inc/SOFIE/RModel_GNN.hxx b/core/inc/SOFIE/RModel_GNN.hxx similarity index 94% rename from src/SOFIE_core/inc/SOFIE/RModel_GNN.hxx rename to core/inc/SOFIE/RModel_GNN.hxx index 558f82c..93bb092 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel_GNN.hxx +++ b/core/inc/SOFIE/RModel_GNN.hxx @@ -66,7 +66,7 @@ struct GNN_Init { break; } default: { - throw std::runtime_error("TMVA SOFIE: Invalid Update function supplied for creating GNN function block."); + throw std::runtime_error("SOFIE: Invalid Update function supplied for creating GNN function block."); } } } @@ -88,7 +88,7 @@ struct GNN_Init { break; } default: { - throw std::runtime_error("TMVA SOFIE: Invalid Aggregate function supplied for creating GNN function block."); + throw std::runtime_error("SOFIE: Invalid Aggregate function supplied for creating GNN function block."); } } } diff --git a/src/SOFIE_core/inc/SOFIE/RModel_GraphIndependent.hxx b/core/inc/SOFIE/RModel_GraphIndependent.hxx similarity index 96% rename from src/SOFIE_core/inc/SOFIE/RModel_GraphIndependent.hxx rename to core/inc/SOFIE/RModel_GraphIndependent.hxx index 407c645..dfade7f 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel_GraphIndependent.hxx +++ b/core/inc/SOFIE/RModel_GraphIndependent.hxx @@ -49,7 +49,7 @@ struct GraphIndependent_Init { } default: { throw std::runtime_error( - "TMVA SOFIE: Invalid Update function supplied for creating GraphIndependent function block."); + 
"SOFIE: Invalid Update function supplied for creating GraphIndependent function block."); } } } diff --git a/core/inc/SOFIE/ROperator.hxx b/core/inc/SOFIE/ROperator.hxx new file mode 100644 index 0000000..20cdf6e --- /dev/null +++ b/core/inc/SOFIE/ROperator.hxx @@ -0,0 +1,128 @@ +#ifndef SOFIE_ROPERATOR +#define SOFIE_ROPERATOR + +#include +#include +#include + +#include "SOFIE/SOFIE_common.hxx" + + +namespace SOFIE{ + +class RModel; + +enum class OperatorKind { + GEMM = 0, + LAYERNORM = 1, + RELU = 2, + CONSTANT = 3, + CONSTANTOFSHAPE = 4, + UNDEFINED = 5, + CONV=6, + BATCHNORM=7, + CAST=8, + COMPARISON=9, + EINSUM=10, + ELU=11, + SIGMOID=12, + TANH=13, + SOFTMAX=14, + LEAKYRELU=15, + UNARY_RECIPROCAL=16, + UNARY_SQRT=17, + UNARY_NEG=18, + UNARY_EXP=19, + UNARY_LOG=20, + UNARY_SIN=21, + UNARY_COS=22, + UNARY_ABS=23 +}; + +inline const char* toString(OperatorKind kind) { + switch (kind) { + case OperatorKind::GEMM: return "GEMM"; + case OperatorKind::LAYERNORM: return "LAYERNORM"; + case OperatorKind::RELU: return "RELU"; + case OperatorKind::CONSTANT: return "CONSTANT"; + case OperatorKind::CONSTANTOFSHAPE: return "CONSTANTOFSHAPE"; + case OperatorKind::BATCHNORM: return "BATCHNORM"; + case OperatorKind::CONV: return "CONV"; + case OperatorKind::UNDEFINED: return "UNDEFINED"; + default: return "UNKNOWN"; + } +} + +inline std::set FusableKinds = { OperatorKind::RELU, OperatorKind::LAYERNORM, OperatorKind::BATCHNORM}; + +class ROperator{ + + +public: + virtual std::vector GetBlasRoutines() { return {}; } + virtual std::vector GetStdLibs() { return {}; } + virtual std::vector> ShapeInference(std::vector>) { return {}; }; + virtual std::vector TypeInference(std::vector) { return {}; }; + virtual void Initialize(RModel&) = 0; + virtual std::string Generate(std::string OpName) = 0; //expect unique opName for each operator within the same RModel + virtual std::string Generate_GPU_ALPAKA(std::string OpName){ return "";} //expect unique opName for each operator within the 
same RModel + // generate initialization code for session constructor + virtual std::string GenerateInitCode() { return "";} + virtual std::string GenerateInitCode_GPU_ALPAKA() { return "";}; + // generate some specific declaration code for Session + virtual std::string GenerateDeclCode() { return "";} + // generate session data members specific to operator + virtual std::string GenerateSessionMembersCode(std::string /*opName*/) { return ""; } + virtual std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { return ""; } + virtual std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) { return ""; } + virtual std::string Header() { return "";} + virtual std::string GetFusableOutputTensorName() { return "";} + virtual std::string GetBlasConfig() { return ""; } + virtual void UpdateFusableTensorName(std::string, const std::function& removal_func){ return;}; + + // Elementwise kernel fusion interface + virtual bool IsElementwise() const { return false; } + // Returns the C++ expression applying this op to inputVar (a local T variable) for fused kernel generation + virtual std::string GetElementwiseExpr(const std::string& /*inputVar*/) const { return ""; } + + //virtual void Forward_reference() = 0; + //virtual void Forward_blas() = 0; + virtual ~ROperator(){} + +protected: + OperatorKind fKind = OperatorKind::UNDEFINED; + size_t fOpOrder = 0; + const std::string SP = " "; ///< space used to correctly indent the generated C++ code + bool fUseSession = false; ///< flag to identify if using the session class + bool fIsOutputConstant = false; ///< flag to identify if operator has a constant output (no need to generate code) + bool fIsOutputParamShape = false; ///< flag to identify of the output represents a parametric shape (can be known at compile time) + + mutable std::vector fInputTensorNames; + mutable std::vector fOutputTensorNames; + +public: + std::span GetOpInputTensors() const { + return fInputTensorNames; + } + + std::span 
GetOpOutputTensors() const { + return fOutputTensorNames; + } + + OperatorKind GetKind() const { return fKind; } + bool IsOutputConstant() const { return fIsOutputConstant; } + + void RegisterOperatorOrder(const size_t ord){ + fOpOrder = ord; + } + size_t GetOpOrder(){ + return fOpOrder; + } + +}; + + + +}//SOFIE + +#endif //SOFIE_OPERATOR diff --git a/core/inc/SOFIE/ROperator_BasicBinary.hxx b/core/inc/SOFIE/ROperator_BasicBinary.hxx new file mode 100644 index 0000000..a40f6b8 --- /dev/null +++ b/core/inc/SOFIE/ROperator_BasicBinary.hxx @@ -0,0 +1,571 @@ +#ifndef SOFIE_SOFIE_ROperator_BasicBinary +#define SOFIE_SOFIE_ROperator_BasicBinary + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + +namespace SOFIE { + +enum EBasicBinaryOperator { + Add, + Sub, + Mul, + Div, + Pow, + Mod, + FMod +}; + +template +struct BinaryOperatorTrait {}; + +template +struct BinaryOperatorTrait { + static const std::string Name() { return "Add"; } + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " + " + t2; } + static T Func(T t1, T t2) { return t1 + t2; } +}; + +template +struct BinaryOperatorTrait { + static const std::string Name() { return "Sub"; } + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " - " + t2; } + static T Func(T t1, T t2) { return t1 - t2; } +}; + +template +struct BinaryOperatorTrait { + static const std::string Name() { return "Mul"; } + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " * " + t2; } + static T Func(T t1, T t2) { return t1 * t2; } +}; + +template +struct BinaryOperatorTrait { + static const std::string Name() { return "Div"; } + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " / " + t2; } + static T Func(T t1, T t2) { return t1 / t2; } +}; + +template +struct BinaryOperatorTrait { + static const std::string Name() { return "Pow"; } + static std::string 
Op(const std::string &t1, const std::string t2) { return "std::pow(" + t1 + "," + t2 + ")"; } + static T Func(T t1, T t2) { return std::pow(t1, t2); } +}; +template +struct BinaryOperatorTrait { + static const std::string Name() { return "Mod"; } + static std::string Op(const std::string & t1, const std::string t2) { return "(" + t1 + " % " + t2 + ")"; } + static T Func(T t1, T t2) { return t1 % t2; } +}; +template +struct BinaryOperatorTrait { + static const std::string Name() { return "FMod"; } + static std::string Op(const std::string & t1, const std::string t2) { return "std::fmod(" + t1 + "," + t2 + ")"; } + static T Func(T t1, T t2) { return std::fmod(t1, t2); } +}; + +template +class ROperator_BasicBinary final : public ROperator { +private: + int fBroadcastFlag = 0; + std::string fNA; + std::string fNB; + std::string fNBroadcastedA; + std::string fNBroadcastedB; + std::string fNY; + + std::vector fShapeA; + std::vector fShapeB; + std::vector fShapeY; + + std::vector fDimShapeA; + std::vector fDimShapeB; + std::vector fDimShapeY; + +public: + ROperator_BasicBinary() {} + ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY) + : fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = {fNA, fNB}; + fOutputTensorNames = {fNY}; + } + + // type of output given input + std::vector TypeInference(std::vector input) override { return input; } + + // shape of output tensors given input tensors + std::vector> ShapeInference(std::vector> input) override + { + // assume now inputs have same shape (no broadcasting) + auto ret = std::vector>(1, input[0]); // return vector size 1 with first input + return ret; + } + + void Initialize(RModel &model) override + { + // input must be a graph input, or already initialized intermediate tensor + if (!model.CheckIfTensorAlreadyExist(fNA)) { + throw std::runtime_error(std::string("SOFIE Binary Op Input Tensor ") + fNA + "is not found in 
model"); + } + if (!model.CheckIfTensorAlreadyExist(fNB)) { + throw std::runtime_error(std::string("SOFIE Binary Op Input Tensor ") + fNB + "is not found in model"); + } + int dynamicInputs = 0; + if (model.IsDynamicTensor(fNA)) { + fDimShapeA = model.GetDynamicTensorShape(fNA); + dynamicInputs |= 1; + } else { + fShapeA = model.GetTensorShape(fNA); + fDimShapeA = ConvertShapeToDim(fShapeA); + } + if (model.IsDynamicTensor(fNB)) { + dynamicInputs |= 2; + fDimShapeB = model.GetDynamicTensorShape(fNB); + } else { + fShapeB = model.GetTensorShape(fNB); + fDimShapeB = ConvertShapeToDim(fShapeB); + } + if (dynamicInputs & 1 && model.Verbose()) + std::cout << BinaryOperatorTrait::Name() << " : input " << fNA << " is dynamic " + << ConvertDimShapeToString(fDimShapeA) << std::endl; + if (dynamicInputs & 2 && model.Verbose()) + std::cout << BinaryOperatorTrait::Name() << " : input " << fNB << " is dynamic " + << ConvertDimShapeToString(fDimShapeB) << std::endl; + + // check if need to broadcast at initialization time if shapes are known and different + // (we could broadcast the tensor tensor to maximum values of dynamic shapes - to be done) + // case of known shapes + // if shapes are known find the output shape from broadcasting + if (dynamicInputs == 0) { + auto ret = UTILITY::MultidirectionalBroadcastShape(fShapeA, fShapeB); + fBroadcastFlag = ret.first; + fShapeY = ret.second; + auto lengthY = ConvertShapeToLength(fShapeY); + if (model.IsConstantTensor(fNA) && model.IsConstantTensor(fNB)) { + bool broadcast = fBroadcastFlag > 0; + if (broadcast) { + // Y is the common shape of A and B + bool broadcastA = fBroadcastFlag & 2; + bool broadcastB = fBroadcastFlag & 1; + // Broadcast A to Y + if (broadcastA) { + fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY; + auto data = model.GetInitializedTensorData(fNA); + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), + std::default_delete()); + if (model.Verbose()) + 
std::cout << "broadcasted data A " << ConvertShapeToString(fShapeY) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeY), + static_cast(broadcastedData.get())) + << std::endl; + // Update the data and the shape of A + model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); + fShapeA = fShapeY; + fDimShapeA = ConvertShapeToDim(fShapeA); + } + // Broadcast B to Y + if (broadcastB) { + fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY; + auto data = model.GetInitializedTensorData(fNB); + if (model.Verbose()) + std::cout << "data B " << ConvertShapeToString(fShapeB) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast(data.get())) + << std::endl; + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), + std::default_delete()); + // do not update tensor B but add broadcasted one (since it can be input to some other operators) + if (model.Verbose()) + std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeY), + static_cast(broadcastedData.get())) + << std::endl; + model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); + fShapeB = fShapeY; + fDimShapeB = ConvertShapeToDim(fShapeB); + } + } else { + fShapeY = fShapeA; + } + // tensors are constant: perform here the binary operation + + const std::string &nameA = fNBroadcastedA.empty() ? fNA : fNBroadcastedA; + const std::string &nameB = fNBroadcastedB.empty() ? 
fNB : fNBroadcastedB; + auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); + auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); + std::vector dataY(lengthY); + for (size_t i = 0; i < dataY.size(); i++) { + dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); + } + model.AddConstantTensor(fNY, fShapeY, dataY.data()); + // flag tensors to not be written in the generated code or weight file + model.SetNotWritableInitializedTensor(nameA); + model.SetNotWritableInitializedTensor(nameB); + fIsOutputConstant = true; + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << " : " << ConvertValuesToString(dataY) << std::endl; + } + } else if (((model.IsShapeTensor(fNA) && model.IsShapeTensor(fNB)) || + (model.IsShapeTensor(fNA) && model.IsInitializedTensor(fNB)) || + (model.IsShapeTensor(fNB) && model.IsInitializedTensor(fNA))) + && (fShapeA.size() <=1 && fShapeB.size() <=1 && model.GetTensorType(fNA) == ETensorType::INT64)) { + // case of shape tensors ( tensors are of rank 0 or 1 ) + std::vector dimValA; + std::vector dimValB; + if (model.IsShapeTensor(fNA)) + dimValA = model.GetShapeTensorValues(fNA); + if (model.IsShapeTensor(fNB)) + dimValB = model.GetShapeTensorValues(fNB); + // adjust for broadcasting - repet values until it reaches shapes of Y + if (!fShapeY.empty() && fShapeY[0] > 1) { + if (dimValA.size() == 1) dimValA = std::vector( fShapeY[0], dimValA[0]); + if (dimValB.size() == 1) dimValB = std::vector( fShapeY[0], dimValB[0]); + } + + auto convertDataToDim = [&](const std::string & name, const std::vector & shape, std::vector & dimValues) { + auto data = static_cast(model.GetInitializedTensorData(name).get()); + dimValues.resize(lengthY); + for (size_t i = 0; i < lengthY; i++) { + if (!shape.empty() && lengthY == shape[0]) + 
dimValues[i] = Dim{ static_cast(data[i])}; + else // case dataA is a scalar + dimValues[i] = Dim{ static_cast(data[0])}; + } + }; + if (model.IsInitializedTensor(fNA)) { + convertDataToDim(fNA,fShapeA,dimValA); + } else if (model.IsInitializedTensor(fNB)) { + convertDataToDim(fNB,fShapeB,dimValB); + } + + //perform binary operations on shape tensors + std::vector dimValY(lengthY); + for (size_t i = 0; i < lengthY; i++) { + if (!dimValA[i].isParam && !dimValB[i].isParam) { + size_t d = BinaryOperatorTrait::Func(dimValA[i].dim, dimValB[i].dim); + dimValY[i] = Dim{d}; + } else { + auto res = BinaryOperatorTrait::Op(dimValA[i].GetVal(), dimValB[i].GetVal()); + dimValY[i] = Dim{res, static_cast(-1)}; + } + } + model.AddShapeTensor(fNY,dimValY, fShapeY.empty()); // cannot be a scalar + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << " : " << ConvertDimShapeToString(dimValY) << " (shape)" << std::endl; + } + // no code needs to be generated (flag this as a constant output tensor) + fIsOutputConstant = true; + + } else { + // case of defined and non-constant tensors + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << std::endl; + } + // we convert non-dim shapes to Dim shapes + fDimShapeY = ConvertShapeToDim(fShapeY); + } + } else { + // case A or B have dynamic shapes. 
We need to broadcast if shape are not same + auto ret = UTILITY::MultidirectionalBroadcastShape(fDimShapeA, fDimShapeB); + fBroadcastFlag = ret.first; + fDimShapeY = ret.second; + // case of all parametric shapes and MultiDirectionalBroadcastShape return the max of the 2 + // need to do before we declare the output tensor shape and the broadcasted ones + if (ret.first & 4) { + // check if one of the parameter is an input dimension + // define function to find this + auto IsInputDimParam = [&](const std::string &p) { + auto inputNames = model.GetInputTensorNames(); + for (auto &input : inputNames) { + for (auto &i_s : model.GetDimTensorShape(input)) { + if (i_s.isParam && i_s.param == p) + return true; + } + } + return false; + }; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + auto &s = fDimShapeY[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + if (IsInputDimParam(fDimShapeA[i].param)) { + // case dim is 1 we indicate that the input parameter is equal to 1 + if (fDimShapeA[i].dim != 1) + s = fDimShapeA[i]; + else + s = fDimShapeB[i]; + } else if (IsInputDimParam(fDimShapeB[i].param)) { + if (fDimShapeB[i].dim != 1) + s = fDimShapeB[i]; + else + s = fDimShapeA[i]; + } + } + } + } + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fDimShapeY); + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << ConvertDimShapeToString(fDimShapeA) << " , " + << ConvertDimShapeToString(fDimShapeB) << " --> " << ConvertDimShapeToString(fDimShapeY) << std::endl; + } + } + } + + std::string GenerateInitCode() override + { + std::stringstream out; + return out.str(); + } + + std::string Generate(std::string opName) override + { + + if (fIsOutputConstant) + return ""; + + opName = "op_" + opName; + + std::stringstream out; + out << SP << "\n//------ " << opName << " " << BinaryOperatorTrait::Name() << " --> " + << ConvertDimShapeToString(fDimShapeY) << "\n"; + auto length = ConvertDimShapeToLength(fDimShapeY); + 
std::string typeName = TensorType::Name(); + + // we need to check if we can broadcast (case flag has bit 4 set) + + if (fBroadcastFlag & 4) { + // need to check if shapes are the same + auto lengthA = ConvertDimShapeToLength(fDimShapeA); + auto lengthB = ConvertDimShapeToLength(fDimShapeB); + out << SP << "if (" << lengthA << "!=" << lengthB << ") {\n"; + // check if A->B or B->A + // bool broadcastable = true; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + if (fBroadcastFlag & 5 && fDimShapeY[i] == fDimShapeA[i] && fDimShapeA[i].dim > 1 && + fDimShapeB[i].isParam) { + // B->A B[i] needs to be 1 + out << SP << SP << "if (" << fDimShapeB[i] << "!= 1)\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast B->A in operator " + << opName << "\");\n"; + } + if (fBroadcastFlag & 6 && fDimShapeY[i] == fDimShapeB[i] && fDimShapeB[i].dim > 1 && + fDimShapeA[i].isParam) { + // A-> B A[i] needs to be 1 + out << SP << SP << "if (" << fDimShapeA[i] << "!= 1)\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast A->B in operator " + << opName << "\");\n"; + } else if (fDimShapeA[i].isParam && fDimShapeB[i].isParam) { + // both shapes are parametric and we broadcast to maximum + // we allocate here output vector + out << SP << SP << "if (" << fDimShapeA[i] << " != " << fDimShapeB[i] << " && (" << fDimShapeA[i] + << " != 1 || " << fDimShapeB[i] << " != 1))\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast shapes in operator " << opName + << "\");\n"; + } + } + out << SP << "}\n"; + } + + auto stridesA = UTILITY::ComputeStrideFromShape(fDimShapeA); + auto stridesB = UTILITY::ComputeStrideFromShape(fDimShapeB); + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + std::string compute_idx_A, compute_idx_B, compute_idx_Y; + if (fDimShapeA.empty() || + std::all_of(fDimShapeA.begin(), fDimShapeA.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_A 
= "0"; + } else { + for (size_t i = 0; i < fDimShapeA.size(); ++i) { + if (fDimShapeA[i].dim == 1 || fDimShapeA[i].GetVal() == "1") + continue; + compute_idx_A += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeA.size())); + if (stridesA[i].GetVal() != "1") + compute_idx_A += " * " + stridesA[i].GetVal(); + compute_idx_A += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_A.pop_back(); + } + if (fDimShapeB.empty() || + std::all_of(fDimShapeB.begin(), fDimShapeB.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_B = "0"; + } else { + for (size_t i = 0; i < fDimShapeB.size(); ++i) { + if (fDimShapeB[i].dim == 1 || fDimShapeB[i].GetVal() == "1") + continue; + compute_idx_B += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeB.size())); + if (stridesB[i].GetVal() != "1") + compute_idx_B += " * " + stridesB[i].GetVal(); + compute_idx_B += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_B.pop_back(); + } + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 0; idx_" << i << " < " << fDimShapeY[i] + << "; ++idx_" << i << "){\n"; + compute_idx_Y += "idx_" + std::to_string(i); + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + for (int j = 0; j < nloop + 1; j++) out << SP; + out << "tensor_" << fNY << "[" << compute_idx_Y << "] = " + << BinaryOperatorTrait::Op("tensor_" + fNA + "[" + compute_idx_A + "]", + "tensor_" + fNB + "[" + 
compute_idx_B + "]") + << " ;\n"; + + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) { + if (fIsOutputConstant) + return ""; + + std::string op; + op = "\n//------ "+opName+"_"+BinaryOperatorTrait::Name()+"_KERNEL_ALPAKA\n"; + op += SP + "struct Binary"+opName+BinaryOperatorTrait::Name()+"Kernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * A, T const * B, T * C) const {\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < " + std::to_string(ConvertShapeToLength(fShapeY)) + ") {\n"; + auto stridesA = UTILITY::ComputeStrideFromShape(fShapeA); + auto stridesB = UTILITY::ComputeStrideFromShape(fShapeB); + + for(size_t id_s = 0; id_s < stridesA.size(); ++id_s){ + if(fShapeA[id_s] == 1) + stridesA[id_s] = 0; + } + + for(size_t id_s = 0; id_s < stridesB.size(); ++id_s){ + if(fShapeB[id_s] == 1) + stridesB[id_s] = 0; + } + + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + std::string flattened_index_A = ""; + std::string flattened_index_B = ""; + std::string temp = "idx"; + + for (size_t id_s = 0; id_s < fShapeA.size(); ++id_s) { + + auto strideY = stridesY[id_s]; + auto strideA = stridesA[id_s]; + + // coord expression + std::string coord = "(int)(" + temp + " / " + std::to_string(strideY) + ")"; + + // accumulate into final index + flattened_index_A += coord + " * " + std::to_string(strideA) + " + "; + + // update temp correctly + temp = temp + " - (" + coord + " * " + std::to_string(strideY) + ")"; + } + + // remove trailing " + " + if (!flattened_index_A.empty()) + flattened_index_A.erase(flattened_index_A.size() - 3); + + temp = "idx"; + + for (size_t id_s = 0; id_s < fShapeB.size(); ++id_s) { + + auto strideY = stridesY[id_s]; + auto strideB = stridesB[id_s]; + + // coord expression + std::string coord = 
"(int)(" + temp + " / " + std::to_string(strideY) + ")"; + + // accumulate into final index + flattened_index_B += coord + " * " + std::to_string(strideB) + " + "; + + // update temp correctly + temp = temp + " - (" + coord + " * " + std::to_string(strideY) + ")"; + } + + // remove trailing " + " + if (!flattened_index_B.empty()) + flattened_index_B.erase(flattened_index_B.size() - 3); + + + op += "C[idx] = " + BinaryOperatorTrait::Op("A["+flattened_index_A+"]", "B["+flattened_index_B+"]") + ";\n"; + op += "}\n}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) { + if (fIsOutputConstant) + return ""; + + return SP + "Binary"+OpName+BinaryOperatorTrait::Name()+"Kernel binary" + OpName + "Kernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) { + if (fIsOutputConstant) + return ""; + + if (fDimShapeY.empty()) { + throw std::runtime_error("SOFIE Operator Basic Binary called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDimShapeToLength(fDimShapeY); + out << "\n//------ "+OpName+"_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNY + << ", binary" << OpName << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNA + << "), alpaka::getPtrNative(deviceBuf_" << fNB << "), alpaka::getPtrNative(deviceBuf_" << fNY << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + std::vector GetStdLibs() override + { + if (Op == EBasicBinaryOperator::Pow) { + return {std::string("cmath")}; + } else { + return {}; + } + } + + +}; + +} // namespace SOFIE + +#endif // SOFIE_ROperator_BasicBinary diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicNary.hxx b/core/inc/SOFIE/ROperator_BasicNary.hxx similarity index 96% rename from src/SOFIE_core/inc/SOFIE/ROperator_BasicNary.hxx rename to core/inc/SOFIE/ROperator_BasicNary.hxx index cbe0497..dad27da 
100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicNary.hxx +++ b/core/inc/SOFIE/ROperator_BasicNary.hxx @@ -119,7 +119,7 @@ public: void Initialize(RModel& model) override { for (auto &it : fNInputs) { if (!model.CheckIfTensorAlreadyExist(it)) { - throw std::runtime_error("TMVA SOFIE BasicNary Op Input Tensor " + it + " is not found in model"); + throw std::runtime_error("SOFIE BasicNary Op Input Tensor " + it + " is not found in model"); } fShapeInputs.push_back(model.GetTensorShape(it)); } @@ -145,7 +145,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE BasicNary called to Generate without being initialized first"); + throw std::runtime_error("SOFIE BasicNary called to Generate without being initialized first"); } std::stringstream out; size_t length = ConvertShapeToLength(fShapeY); diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx b/core/inc/SOFIE/ROperator_BasicUnary.hxx similarity index 50% rename from src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx rename to core/inc/SOFIE/ROperator_BasicUnary.hxx index c18c17e..05b861a 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx +++ b/core/inc/SOFIE/ROperator_BasicUnary.hxx @@ -8,7 +8,7 @@ namespace SOFIE { -enum class EBasicUnaryOperator { kReciprocal, kSqrt , kNeg, kExp, kLog, kSin, kCos, kAbs }; +enum class EBasicUnaryOperator { kReciprocal, kSqrt , kNeg, kExp, kLog, kSin, kCos, kAbs, kSoftplus, kAtan, kFloor }; template struct UnaryOpTraits { @@ -62,6 +62,24 @@ struct UnaryOpTraits { static std::string Op(const std::string &X) { return "std::abs(" + X + ")"; } }; +template +struct UnaryOpTraits { + static std::string Name() { return "Softplus"; } + static std::string Op(const std::string &X) { return "std::log(std::exp(" + X + ") + 1)"; } +}; + +template +struct UnaryOpTraits { + static std::string Name() { return "Atan"; } + static std::string Op(const std::string &X) { return 
"std::atan(" + X + ")"; } +}; + +template +struct UnaryOpTraits { + static std::string Name() { return "Floor"; } + static std::string Op(const std::string &X) { return "std::floor(" + X + ")"; } +}; + template class ROperator_BasicUnary final : public ROperator { private: @@ -77,6 +95,33 @@ public: ROperator_BasicUnary(std::string nameX, std::string nameY) : fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) { + + switch(Op) { + case EBasicUnaryOperator::kReciprocal: + fKind = OperatorKind::UNARY_RECIPROCAL; + break; + case EBasicUnaryOperator::kSqrt: + fKind = OperatorKind::UNARY_SQRT; + break; + case EBasicUnaryOperator::kNeg: + fKind = OperatorKind::UNARY_NEG; + break; + case EBasicUnaryOperator::kExp: + fKind = OperatorKind::UNARY_EXP; + break; + case EBasicUnaryOperator::kLog: + fKind = OperatorKind::UNARY_LOG; + break; + case EBasicUnaryOperator::kSin: + fKind = OperatorKind::UNARY_SIN; + break; + case EBasicUnaryOperator::kCos: + fKind = OperatorKind::UNARY_COS; + break; + case EBasicUnaryOperator::kAbs: + fKind = OperatorKind::UNARY_ABS; + break; + } fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; } @@ -107,6 +152,42 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*OpName*/) override { + if (fIsOutputConstant) + return ""; + + std::string op; + op = "\n//------ " + UnaryOpTraits::Name() + "_KERNEL_ALPAKA\n"; + op += SP + "struct Unary" + UnaryOpTraits::Name() + "Kernel{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * data, T * output, std::size_t const length) const {\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < length) {\n"; + op += SP + SP + SP + "output[idx] = " +UnaryOpTraits::Op("data[idx]") + ";\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*OpName*/) override { + return SP + 
"Unary" + UnaryOpTraits::Name() + "Kernel " + UnaryOpTraits::Name() + "Kernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + std::stringstream out; + auto length = ConvertShapeToLength(fShapeX); + out << "\n//------ "+OpName+"_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNY + << ", " << UnaryOpTraits::Name() << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), " << length << ");\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + std::vector GetStdLibs() override { if (Op == EBasicUnaryOperator::kSqrt || Op == EBasicUnaryOperator::kExp || Op == EBasicUnaryOperator::kLog) { return { std::string("cmath") }; @@ -114,6 +195,11 @@ public: return {}; } } + + bool IsElementwise() const override { return !fIsOutputConstant; } + std::string GetElementwiseExpr(const std::string& v) const override { + return UnaryOpTraits::Op(v); + } }; } // namespace SOFIE diff --git a/core/inc/SOFIE/ROperator_Basic_Is.hxx b/core/inc/SOFIE/ROperator_Basic_Is.hxx new file mode 100644 index 0000000..fabe976 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Basic_Is.hxx @@ -0,0 +1,145 @@ +#ifndef SOFIE_ROPERATOR_BASIC_IS +#define SOFIE_ROPERATOR_BASIC_IS + +#include +#include +#include +#include + +namespace SOFIE { + +enum class EBasicIsOperator { kIsInf, kIsInfPos, kIsInfNeg, kIsNaN }; + +template +struct IsOpTraits { +}; + +template<> +struct IsOpTraits { + static std::string Name() { return "IsInf"; } + static std::string Op(const std::string &x) { return "std::isinf(" + x + ")"; } +}; + +template<> +struct IsOpTraits { + static std::string Name() { return "IsInfPos"; } + static std::string Op(const std::string &x) { return "(std::isinf(" + x + ") && " + x + " > 0)"; } +}; + +template<> +struct IsOpTraits { + static std::string Name() { return "IsInfNeg"; } + 
static std::string Op(const std::string &x) { return "(std::isinf(" + x + ") && " + x + " < 0)"; } +}; + +template<> +struct IsOpTraits { + static std::string Name() { return "IsNaN"; } + static std::string Op(const std::string &x) { return "std::isnan(" + x + ")"; } +}; + + +template +class ROperator_Basic_Is final : public ROperator { +private: + std::string fNX; + std::string fNY; + + std::vector fShapeX; + std::vector fShapeY; + +public: + ROperator_Basic_Is() {} + + ROperator_Basic_Is(std::string nameX, std::string nameY) + : fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + void Initialize(RModel& model) override { + if (!model.CheckIfTensorAlreadyExist(fNX)) { + throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); + } + fShapeX = model.GetDimTensorShape(fNX); + fShapeY = fShapeX; + model.AddIntermediateTensor(fNY, ETensorType::BOOL, fShapeY); + } + + std::string Generate(std::string opName) override + { + opName = "op_" + opName; + std::stringstream out; + + out << SP << "\n//---- Operator " << IsOpTraits::Name() << " " << opName << "\n"; + auto length = ConvertDimShapeToLength(fShapeX); + out << SP << "for (size_t i = 0; i < " << length << "; i++) {\n"; + out << SP << SP << "tensor_" << fNY << "[i] = " << IsOpTraits::Op("tensor_" + fNX + "[i]") << ";\n"; + out << SP << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override + { + if (fIsOutputConstant) + return ""; + + std::string op; + op = "\n//------ " + IsOpTraits::Name() + "_KERNEL_ALPAKA\n"; + op += SP + "struct Is" + IsOpTraits::Name() + "Kernel {\n"; + op += SP + SP + "template\n"; + // Output is uint8_t (bool storage), input is T (float/double). 
+ op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const & acc,\n"; + op += SP + SP + SP + "T const * data,\n"; + op += SP + SP + SP + "uint8_t * output,\n"; + op += SP + SP + SP + "std::size_t const length) const\n"; + op += SP + SP + "{\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < length) {\n"; + op += SP + SP + SP + SP + "output[idx] = static_cast(" + IsOpTraits::Op("data[idx]") + ");\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override + { + return SP + "Is" + IsOpTraits::Name() + "Kernel " + IsOpTraits::Name() + "Kernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override + { + opName = "op_" + opName; + std::stringstream out; + auto length = ConvertDimShapeToLength(fShapeX); + + out << "\n//------ " << opName << "_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << length << "});\n"; + out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; + out << SP << "auto task_" << opName + << " = alpaka::createTaskKernel(workDiv_" << fNY + << ", " << IsOpTraits::Name() << "Kernel" + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", " << length << ");\n"; + out << SP << "alpaka::enqueue(queue, task_" << opName << ");\n"; + return out.str(); + } + + bool IsElementwise() const override { return !fIsOutputConstant; } + std::string GetElementwiseExpr(const std::string& v) const override { + return IsOpTraits::Op(v); + } + + std::vector GetStdLibs() override { + return { std::string("cmath") }; + } +}; + +} // namespace SOFIE + +#endif // SOFIE_ROPERATOR_BASIC_IS diff --git 
a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx b/core/inc/SOFIE/ROperator_BatchNormalization.hxx similarity index 65% rename from src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx rename to core/inc/SOFIE/ROperator_BatchNormalization.hxx index a27cea4..8bc3b3d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx +++ b/core/inc/SOFIE/ROperator_BatchNormalization.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_ROPERATOR_BatchNormalization #define SOFIE_ROPERATOR_BatchNormalization -#include "SOFIE_common.hxx" -#include "ROperator.hxx" -#include "RModel.hxx" +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" #include @@ -59,7 +59,7 @@ public: } else{ throw - std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a BatchNormalization operator"); + std::runtime_error("SOFIE Encountered unsupported type parsing a BatchNormalization operator"); } } @@ -72,12 +72,12 @@ public: std::vector> ShapeInference(std::vector> input) override { if (input.size() != 5 ) { throw - std::runtime_error("TMVA SOFIE BatchNormalization Op Shape inference need 5 input tensors"); + std::runtime_error("SOFIE BatchNormalization Op Shape inference need 5 input tensors"); } for(size_t i = 0; i < input.size(); i++) { if (input[i].size() != 4) { throw - std::runtime_error("TMVA SOFIE BatchNormalization Op Shape inference only accept tensor with 4 dimensions"); + std::runtime_error("SOFIE BatchNormalization Op Shape inference only accept tensor with 4 dimensions"); } } @@ -88,30 +88,30 @@ public: void Initialize(RModel& model) override { if (!model.CheckIfTensorAlreadyExist(fNX)) { throw - std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor " + fNX + " fnx is not found in model"); + std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNX + " fnx is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNScale)) { throw - std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor 
" + fNScale + " fns is not found in model"); + std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNScale + " fns is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNB)) { throw - std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor " + fNB + " fnb is not found in model"); + std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNB + " fnb is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNMean)) { throw - std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor " + fNMean + " fnm is not found in model"); + std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNMean + " fnm is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNVar)) { throw - std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor " + fNVar + " fnv is not found in model"); + std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNVar + " fnv is not found in model"); } fShapeX = model.GetTensorShape(fNX); if (fShapeX.size() < 2 || fShapeX.size() > 4) { throw - std::runtime_error("TMVA SOFIE BatchNormalization Op input tensor " + fNX + " fnx has wrong shape : " + ConvertShapeToString(fShapeX)); + std::runtime_error("SOFIE BatchNormalization Op input tensor " + fNX + " fnx has wrong shape : " + ConvertShapeToString(fShapeX)); } fShapeScale = model.GetTensorShape(fNScale); @@ -185,7 +185,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShapeX.empty()){ - throw std::runtime_error("TMVA SOFIE Batch Normalization called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Batch Normalization called to Generate without being initialized first"); } std::stringstream out; @@ -227,6 +227,80 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty()) + throw std::runtime_error("SOFIE BatchNormalization called to Generate 
without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeY); + + std::string kname = "BatchNormKernel_" + opName; + std::string op; + op = "\n//------ BATCHNORM_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ X,\n"; + op += SP + SP + SP + "T const* __restrict__ scale,\n"; + op += SP + SP + SP + "T const* __restrict__ bias,\n"; + op += SP + SP + SP + "T const* __restrict__ mean,\n"; + op += SP + SP + SP + "T* __restrict__ Y,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t i = global_thread_idx; i < totalElements; i += grid_thread_extent) {\n"; + + op += SP + SP + SP + SP + "T val = (X[i] - mean[i]) * scale[i] + bias[i];\n"; + + if (fActivation == EActivationType::RELU) + op += SP + SP + SP + SP + "Y[i] = val > static_cast(0) ? 
val : static_cast(0);\n"; + else + op += SP + SP + SP + SP + "Y[i] = val;\n"; + + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string kname = "BatchNormKernel_" + opName; + return SP + kname + " batchNormKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty()) + throw std::runtime_error("SOFIE BatchNormalization called to Generate without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::string kname = "batchNormKernel_" + opName; + + std::stringstream out; + out << "\n//------ BATCHNORM_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; + + out << SP << "auto task_" << fNY << " = alpaka::createTaskKernel(workDiv_" << fNY + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNScale << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNB << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNMean << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::enqueue(queue, task_" << fNY << ");\n"; + + return out.str(); + } + std::vector GetBlasRoutines() override { return { std::string("Copy"), std::string("Axpy") }; } }; diff --git a/core/inc/SOFIE/ROperator_Cast.hxx b/core/inc/SOFIE/ROperator_Cast.hxx new file mode 100644 index 0000000..84d0048 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Cast.hxx @@ -0,0 +1,167 @@ +#ifndef SOFIE_ROPERATOR_Cast +#define SOFIE_ROPERATOR_Cast + 
+#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + + +namespace SOFIE{ + +template +std::vector convertToInt64(const In* src, size_t n) { + std::vector dst(n); + std::transform(src, src + n, dst.begin(), + [](In v) { return static_cast(v); }); + return dst; +} + + +class ROperator_Cast final : public ROperator +{ + +private: + + std::string fNX; + std::string fNY; + std::vector fShape; + ETensorType fType; + +public: + ROperator_Cast(){} + ROperator_Cast(ETensorType type,std::string nameX, std::string nameY): + fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)), + fType(type) + { + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; //suggest copy to compiler + return ret; + } + + void Initialize(RModel& model) override { + //input must be a graph input, or already initialized intermediate tensor + if (model.CheckIfTensorAlreadyExist(fNX) == false){ + throw std::runtime_error("SOFIE Cast Op Input Tensor is not found in model"); + } + fShape = model.GetDimTensorShape(fNX); + // should we add a check if the same type + auto inputType = model.GetTensorType(fNX); + if (model.IsInitializedTensor(fNX)) { + fIsOutputConstant = true; + auto inputData = model.GetInitializedTensorData(fNX); + if (fType == ETensorType::INT64) { + size_t length = ConvertShapeToLength(fShape); + std::vector convertedData; + if (inputType == ETensorType::FLOAT) { + convertedData = convertToInt64(static_cast(inputData.get()), length); + } else if (inputType == ETensorType::DOUBLE) { + convertedData = convertToInt64(static_cast(inputData.get()), length); + } else if (inputType == ETensorType::INT32) { + convertedData = convertToInt64(static_cast(inputData.get()), length); + } else { + // Already INT64 — safe direct copy + 
convertedData.assign(static_cast(inputData.get()), + static_cast(inputData.get()) + length); + } + model.AddConstantTensor(fNY, ConvertShapeToInt(fShape), convertedData.data()); + model.SetNotWritableInitializedTensor(fNX); + } + else + fIsOutputConstant = false; + } else if (model.IsShapeTensor(fNX) && fType == ETensorType::INT64) { + auto shapeData = model.GetShapeTensorValues(fNX); + model.AddShapeTensor(fNY, shapeData, fShape.size() == 0); + fIsOutputConstant = true; + } + if (!fIsOutputConstant) + model.AddIntermediateTensor(fNY, fType, fShape); + if (model.Verbose()) { + std::cout << "Cast : " << ConvertTypeToString(inputType) << " " << fNX << " -> " << ConvertTypeToString(fType); + if (fType == ETensorType::BOOL) std::cout << " (converted from BOOL) "; + std::cout << " for " << fNY << " shape " << ConvertDimShapeToString(fShape); + if (fIsOutputConstant) std::cout << " (constant) "; + std::cout << std::endl; + } + } + + + std::string Generate(std::string opName) override { + + // output shape can be empty if is a scalar + + std::stringstream out; + auto length = ConvertDimShapeToLength(fShape); + + out << "\n//------ CAST " << opName << " ---> " << fNY << " " << ConvertDimShapeToString(fShape) << "\n"; + // no generated code for constant outputs + if (fIsOutputConstant) return out.str(); + + out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; + + // need to handle bool case separatly since casting to uint8 will not give right result + if (fType == ETensorType::BOOL) + out << SP << SP << "tensor_" << fNY << "[id] = (tensor_" << fNX << "[id] != 0) ? 
1 : 0;\n"; + else + out << SP << SP << "tensor_" << fNY << "[id] = static_cast<"<< ConvertTypeToString(fType) << ">(tensor_" << fNX << "[id]);\n"; + + out << SP << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + std::string op; + op = "\n//------ CAST_KERNEL_ALPAKA\n"; + op += SP + "struct CastKernel"+opName+"{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, SrcT const * src, DstT * dst, std::size_t numElements) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElements(acc, numElements)) {\n"; + op += SP + SP + SP + "dst[i] = static_cast(src[i]);\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + return SP + "CastKernel"+opName+" castKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + if (fIsOutputConstant) return ""; + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Operator Cast called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertDimShapeToLength(fShape); + out << "\n//------ CAST_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNY << ", castKernel, alpaka::getPtrNative(deviceBuf_" << fNX << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << ")); \n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + bool IsElementwise() const override { return true; } + std::string GetElementwiseExpr(const std::string& v) const override { + return "static_cast<" + ConvertTypeToString(fType) + ">(" + v + ")"; + } + +}; + +}//SOFIE + +#endif //SOFIE_ROPERATOR_Cast diff --git 
a/core/inc/SOFIE/ROperator_Clip.hxx b/core/inc/SOFIE/ROperator_Clip.hxx new file mode 100644 index 0000000..0439b50 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Clip.hxx @@ -0,0 +1,363 @@ +#ifndef SOFIE_ROPERATOR_CLIP +#define SOFIE_ROPERATOR_CLIP + +#include "SOFIE_common.hxx" +#include "ROperator.hxx" +#include "RModel.hxx" + +#include +#include +#include +#include +#include + +namespace SOFIE { + +// --------------------------------------------------------------------------- +// ROperator_Clip +// +// ONNX spec: Y = max(min_val, min(max_val, X)) element-wise +// +// The min and max bounds are optional in the ONNX spec: +// - if fNMin is empty → no lower clipping (effectively -inf) +// - if fNMax is empty → no upper clipping (effectively +inf) +// +// Bounds can be provided either as: +// (a) initializer / constant tensors (scalar, shape []), +// (b) runtime input tensors (resolved at Generate time), +// (c) compile-time float literals (via the fMin / fMax attributes). +// +// The implementation follows the Selu operator style exactly: +// - static shape stored in fShape +// - dynamic shape stored in fDimShape +// - a flat loop over all elements in Generate() +// --------------------------------------------------------------------------- + +template +class ROperator_Clip final : public ROperator { +private: + + // Tensor names + std::string fNX; // input data + std::string fNY; // output + std::string fNMin; // optional: tensor name for min bound + std::string fNMax; // optional: tensor name for max bound + + + // Static shape (non-dynamic path, mirrors Selu) + std::vector fShape; + + // Dynamic shape (Dim-aware, for dynamic input tensors) + std::vector fDimShape; + bool fIsDynamic = false; + + // Compile-time bound values — used when bounds are constant tensors + // Initialised to the ONNX defaults (no clipping) + T fMin = std::numeric_limits::lowest(); // -inf equivalent + T fMax = std::numeric_limits::max(); // +inf equivalent + + // Flags indicating whether 
each bound is: + // - absent (no input provided) + // - a constant resolved at Initialize time + // - a runtime tensor that must be read in the generated code + bool fHasMin = false; + bool fHasMax = false; + bool fMinIsConstant = false; + bool fMaxIsConstant = false; + +public: + + ROperator_Clip() {} + + // Constructor for the common case where bounds are tensor inputs + // (follows ONNX node input order: X, min, max) + ROperator_Clip(std::string nameX, + std::string nameY, + std::string nameMin = "", + std::string nameMax = "") + : fNX (UTILITY::Clean_name(nameX)), + fNY (UTILITY::Clean_name(nameY)), + fNMin(nameMin.empty() ? "" : UTILITY::Clean_name(nameMin)), + fNMax(nameMax.empty() ? "" : UTILITY::Clean_name(nameMax)) + { + fInputTensorNames = { fNX }; + if (!fNMin.empty()) fInputTensorNames.push_back(fNMin); + if (!fNMax.empty()) fInputTensorNames.push_back(fNMax); + fOutputTensorNames = { fNY }; + } + + // Convenience constructor when bounds are known scalars at model-build time + ROperator_Clip(std::string nameX, + std::string nameY, + T minVal, + T maxVal) + : fNX (UTILITY::Clean_name(nameX)), + fNY (UTILITY::Clean_name(nameY)), + fMin(minVal), fMax(maxVal), + fHasMin(true), fHasMax(true), + fMinIsConstant(true), fMaxIsConstant(true) + { + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + + // ----------------------------------------------------------------------- + void Initialize(RModel& model) override + { + // ---- validate main input ------------------------------------------ + if (!model.CheckIfTensorAlreadyExist(fNX)) + throw std::runtime_error( + "SOFIE Clip Op Input Tensor " + fNX + " is not found in model"); + + // ---- collect shape (static or dynamic, mirrors BasicBinary) ------- + if (model.IsDynamicTensor(fNX)) { + fIsDynamic = true; + fDimShape = model.GetDynamicTensorShape(fNX); + } else { + fShape = model.GetTensorShape(fNX); + fDimShape = ConvertShapeToDim(fShape); + } + + // ---- resolve min bound 
-------------------------------------------- + if (!fNMin.empty() && model.CheckIfTensorAlreadyExist(fNMin)) { + fHasMin = true; + if (model.IsInitializedTensor(fNMin)) { + // constant scalar tensor — read value now + auto data = static_cast(model.GetInitializedTensorData(fNMin).get()); + fMin = data[0]; + fMinIsConstant = true; + model.SetNotWritableInitializedTensor(fNMin); + } + // else: runtime input — will be dereferenced in generated code + } + + // ---- resolve max bound -------------------------------------------- + if (!fNMax.empty() && model.CheckIfTensorAlreadyExist(fNMax)) { + fHasMax = true; + if (model.IsInitializedTensor(fNMax)) { + auto data = static_cast(model.GetInitializedTensorData(fNMax).get()); + fMax = data[0]; + fMaxIsConstant = true; + model.SetNotWritableInitializedTensor(fNMax); + } + } + + // ---- register output tensor --------------------------------------- + if (fIsDynamic) + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fDimShape); + else + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); + + if (model.Verbose()) { + std::cout << "Clip : " << fNX << " " + << ConvertShapeToString(fShape); + if (fHasMin) + std::cout << " min=" << (fMinIsConstant + ? std::to_string(fMin) : fNMin + "(runtime)"); + if (fHasMax) + std::cout << " max=" << (fMaxIsConstant + ? 
std::to_string(fMax) : fNMax + "(runtime)"); + std::cout << " --> " << fNY << "\n"; + } + + // only needs and — no cmath + model.AddNeededStdLib("algorithm"); + model.AddNeededStdLib("limits"); + } + + + // ----------------------------------------------------------------------- + // GPU ALPAKA + // ----------------------------------------------------------------------- + + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override + { + std::string op; + op = "\n//------ CLIP_KERNEL_ALPAKA\n"; + op += "struct ClipKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + "TAcc const & acc,\n"; + op += SP + SP + "T const * __restrict__ data,\n"; + op += SP + SP + "T * __restrict__ out,\n"; + op += SP + SP + "std::size_t numElements,\n"; + op += SP + SP + "T minVal,\n"; + op += SP + SP + "T maxVal) const\n"; + op += SP + "{\n"; + op += SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + "if (idx < numElements) {\n"; + op += SP + SP + SP + "T val = data[idx];\n"; + op += SP + SP + SP + "val = val < minVal ? minVal : val;\n"; + op += SP + SP + SP + "out[idx] = val > maxVal ? 
maxVal : val;\n"; + op += SP + SP + "}\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override + { + return "ClipKernel clipKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override + { + OpName = "op_" + OpName; + if (fShape.empty() && fDimShape.empty()) + throw std::runtime_error( + "SOFIE Operator Clip called to Generate_GPU_ALPAKA without being initialized first"); + + std::stringstream out; + out << "\n//------ CLIP_GPU_ALPAKA " << OpName << "\n"; + + std::string length = ConvertDimShapeToLength(fDimShape); + + std::string minExpr, maxExpr; + if (fMinIsConstant) { + minExpr = ToStringHighPrec(fMin); + } else if (fHasMin) { + throw std::runtime_error( + "SOFIE Clip GPU ALPAKA: runtime (non-constant) min bound is not supported in GPU path"); + } else { + minExpr = "std::numeric_limits<" + TensorType::Name() + ">::lowest()"; + } + + if (fMaxIsConstant) { + maxExpr = ToStringHighPrec(fMax); + } else if (fHasMax) { + throw std::runtime_error( + "SOFIE Clip GPU ALPAKA: runtime (non-constant) max bound is not supported in GPU path"); + } else { + maxExpr = "std::numeric_limits<" + TensorType::Name() + ">::max()"; + } + + std::string castMin = "static_cast<" + TensorType::Name() + ">(" + minExpr + ")"; + std::string castMax = "static_cast<" + TensorType::Name() + ">(" + maxExpr + ")"; + + out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << length << "});\n"; + out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; + out << SP << "auto task_" << OpName + << " = alpaka::createTaskKernel(workDiv_" << fNY << ", clipKernel" + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << length << ")" + << ", " << castMin << ", " << 
castMax << ");\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + bool IsElementwise() const override { return true; } + + std::string GetElementwiseExpr(const std::string& v) const override + { + std::string minExpr, maxExpr; + if (fMinIsConstant) minExpr = ToStringHighPrec(fMin); + else if (fHasMin) minExpr = "tensor_" + fNMin + "[0]"; + else minExpr = "std::numeric_limits<" + TensorType::Name() + ">::lowest()"; + + if (fMaxIsConstant) maxExpr = ToStringHighPrec(fMax); + else if (fHasMax) maxExpr = "tensor_" + fNMax + "[0]"; + else maxExpr = "std::numeric_limits<" + TensorType::Name() + ">::max()"; + + std::string expr = fHasMax || fMaxIsConstant ? "std::min(" + maxExpr + ", " + v + ")" : v; + if (fHasMin || fMinIsConstant) + expr = "std::max(" + minExpr + ", " + expr + ")"; + return expr; + } + + std::string GetFusableOutputTensorName() override { return fNY; } + + void UpdateFusableTensorName(std::string fusable_tensor_name, + const std::function& removal_func) override + { + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } + + // ----------------------------------------------------------------------- + // Generate + // ----------------------------------------------------------------------- + std::string Generate(std::string OpName) override + { + OpName = "op_" + OpName; + + if (fShape.empty() && fDimShape.empty()) + throw std::runtime_error( + "SOFIE Operator Clip called to Generate without being initialized first"); + + std::stringstream out; + out << SP << "\n//------ CLIP " << OpName << "\n"; + + // ---- build the length expression (static or dynamic) ------------- + std::string length = ConvertDimShapeToLength(fDimShape); + + // ---- build min/max expressions for the generated code ------------ + // + // Priority: + // 1. compile-time constant value → emit literal + // 2. 
runtime input tensor → emit tensor_[0] (scalar) + // 3. not provided → emit numeric_limits extreme + // + std::string minExpr, maxExpr; + + if (fMinIsConstant) { + minExpr = ToStringHighPrec(fMin); + } else if (fHasMin) { + minExpr = "tensor_" + fNMin + "[0]"; // scalar input tensor + } else { + // No lower bound — use lowest representable value + minExpr = "std::numeric_limits<" + TensorType::Name() + + ">::lowest()"; + } + + if (fMaxIsConstant) { + maxExpr = ToStringHighPrec(fMax); + } else if (fHasMax) { + maxExpr = "tensor_" + fNMax + "[0]"; + } else { + // No upper bound — use max representable value + maxExpr = "std::numeric_limits<" + TensorType::Name() + + ">::max()"; + } + + auto tensorValue = [](const std::string & name, const std::string & index) { + std::stringstream s; + s << "tensor_" << name << "[" << index << "]"; + return s.str(); + }; + + // ---- flat element loop (identical structure to Selu) ------------- + out << SP << "for (int id = 0; id < " << length << " ; id++) {\n"; + std::string firstExpr = fHasMax ? "std::min(" + maxExpr + ", " + tensorValue(fNX, "id") + ")" : tensorValue(fNX, "id"); + std::string secondExpr = fHasMin ? 
"std::max(" + minExpr + ", " + firstExpr + ")" : firstExpr; + out << SP << SP << tensorValue(fNY, "id") << " = " << secondExpr << ";\n"; + out << SP << "}\n"; + + return out.str(); + } + + +private: + + // Helper: convert a T value to string with enough precision + std::string ToStringHighPrec(T val) const { + std::ostringstream ss; + ss << std::setprecision(std::numeric_limits::max_digits10) << val; + // add dot if missing + if (ss.str().find(".") == std::string::npos) ss << "."; + // append 'f' suffix for float literals so generated code compiles + // cleanly without implicit double→float conversion warnings + if (std::is_same::value) ss << "f"; + return ss.str(); + } +}; + +} // namespace SOFIE + +#endif // SOFIE_ROPERATOR_CLIP \ No newline at end of file diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx b/core/inc/SOFIE/ROperator_Comparision.hxx similarity index 58% rename from src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx rename to core/inc/SOFIE/ROperator_Comparision.hxx index 7648a9a..db7b9e6 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx +++ b/core/inc/SOFIE/ROperator_Comparision.hxx @@ -1,4 +1,3 @@ - #ifndef SOFIE_ROperator_Comparision #define SOFIE_ROperator_Comparision @@ -73,30 +72,26 @@ public: ROperator_Comparision(){} ROperator_Comparision(const std::string & nameX1, const std::string & nameX2, const std::string & nameY): fNX1(UTILITY::Clean_name(nameX1)), fNX2(UTILITY::Clean_name(nameX2)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::COMPARISON; fInputTensorNames = { fNX1, fNX2 }; - - // output will be a boolean vector so should not be considered for memory optimized pool fOutputTensorNames = { fNY }; } - // type of output given input std::vector TypeInference(std::vector input) override { return input; } - // shape of output tensors given input tensors std::vector> ShapeInference(std::vector> input) override { - auto ret = input; // return vector size 1 with first input + auto ret = input; return ret; 
} void Initialize(RModel& model) override { - // input must be a graph input, or already initialized intermediate tensor if (!model.CheckIfTensorAlreadyExist(fNX1)){ - throw std::runtime_error(std::string("TMVA SOFIE Comparision Op Input Tensor ") + fNX1 + "is not found in model"); + throw std::runtime_error(std::string("SOFIE Comparision Op Input Tensor ") + fNX1 + "is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNX2)) { - throw std::runtime_error(std::string("TMVA SOFIE Comparision Op Input Tensor ") + fNX2 + "is not found in model"); + throw std::runtime_error(std::string("SOFIE Comparision Op Input Tensor ") + fNX2 + "is not found in model"); } fShapeX1 = model.GetTensorShape(fNX1); fShapeX2 = model.GetTensorShape(fNX2); @@ -104,38 +99,31 @@ public: fTensorType2 = model.GetTensorType(fNX2); bool broadcast = !UTILITY::AreSameShape(fShapeX1, fShapeX2); if (broadcast) { - // Y is the common shape of A and B fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeX1, fShapeX2); bool broadcastX1 = !UTILITY::AreSameShape(fShapeX1, fShapeY); bool broadcastX2 = !UTILITY::AreSameShape(fShapeX2, fShapeY); - // Broadcast A to Y if (broadcastX1) { if (model.IsInitializedTensor(fNX1)) { auto data = model.GetInitializedTensorData(fNX1); std::shared_ptr broadcastedData( UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX1, fShapeY), std::default_delete()); - // Update the data and the shape of A model.UpdateInitializedTensor(fNX1, model.GetTensorType(fNX1), fShapeY, broadcastedData); fShapeX1 = fShapeY; } else { - // Add an intermediate tensor for broadcasting A fNBroadcastedX1 = "Broadcasted" + fNX1; model.AddIntermediateTensor(fNBroadcastedX1, model.GetTensorType(fNX1), fShapeY); } } - // Broadcast B to Y if (broadcastX2) { if (model.IsInitializedTensor(fNX2)) { auto data = model.GetInitializedTensorData(fNX2); std::shared_ptr broadcastedData( UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX2, fShapeY), std::default_delete()); - 
// Update the data and the shape of B model.UpdateInitializedTensor(fNX2, model.GetTensorType(fNX2), fShapeY, broadcastedData); fShapeX2 = fShapeY; } else { - // Add an intermediate tensor for broadcasting B fNBroadcastedX2 = "Broadcasted" + fNX2; model.AddIntermediateTensor(fNBroadcastedX2, model.GetTensorType(fNX2), fShapeY); } @@ -143,8 +131,7 @@ public: } else { fShapeY = fShapeX1; } - // case of constant tensors - if (model.IsInitializedTensor(fNX1) && model.IsInitializedTensor(fNX2) ) { + if (model.IsInitializedTensor(fNX1) && model.IsInitializedTensor(fNX2)) { fIsOutputConstant = true; auto data1 = static_cast(model.GetInitializedTensorData(fNX1).get()); auto data2 = static_cast(model.GetInitializedTensorData(fNX2).get()); @@ -158,9 +145,8 @@ public: << ConvertValuesToString(length,outData) << std::endl; delete [] outData; } else { - model.AddIntermediateTensor(fNY, ETensorType::BOOL , fShapeY); + model.AddIntermediateTensor(fNY, ETensorType::BOOL, fShapeY); } - // check if this is not output operators to add a specific line for definining the tensor_xxx variable const auto & outputTensorNames = model.GetOutputTensorNames(); fIsModelOutput = false; if (std::find(outputTensorNames.begin(), outputTensorNames.end(), fNY) != outputTensorNames.end()) @@ -170,14 +156,12 @@ public: std::string Generate(std::string OpName) override { if (fIsOutputConstant) return ""; OpName = "op_" + OpName; - - if (fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Comparision Op called to Generate without being initialized first"); + if (fShapeY.empty()) { + throw std::runtime_error("SOFIE Comparision Op called to Generate without being initialized first"); } std::stringstream out; out << SP << "\n//------ " << ComparisionTrait::Name() << "\n"; size_t length = ConvertShapeToLength(fShapeY); - // Broadcast A if it's uninitialized if (!fNBroadcastedX1.empty()) { std::string type1 = ConvertTypeToString(fTensorType1); out << SP << "// Broadcasting uninitialized tensor " << fNX1 
<< "\n"; @@ -187,7 +171,6 @@ public: out << SP << SP << "delete[] data;\n"; out << SP << "}\n"; } - // Broadcast B if it's uninitialized if (!fNBroadcastedX2.empty()) { std::string type2 = ConvertTypeToString(fTensorType2); out << SP << "// Broadcasting uninitialized tensor " << fNX2 << "\n"; @@ -199,14 +182,126 @@ public: } const std::string& nameX1 = fNBroadcastedX1.empty()? fNX1 : fNBroadcastedX1; const std::string& nameX2 = fNBroadcastedX2.empty()? fNX2 : fNBroadcastedX2; - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; out << SP << SP << "fTensor_" << fNY << "[id] = " << ComparisionTrait::Op( "tensor_" + nameX1 + "[id]" , "tensor_" + nameX2 + "[id]") << " ;\n"; out << SP << "}\n"; - // since output is a boolean need to add the tensor_xxx variable since it is not defined as a pointer to a boolean std::vector if (!fIsModelOutput) out << SP << "const std::vector & tensor_" << fNY << " = fTensor_" << fNY << ";\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE Comparision Op called to Generate without being initialized first"); + + const std::size_t D = fShapeY.size(); + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::vector shapeX1_padded(D, 1); + std::vector shapeX2_padded(D, 1); + { + size_t off1 = D - fShapeX1.size(); + for (size_t i = 0; i < fShapeX1.size(); ++i) + shapeX1_padded[off1 + i] = fShapeX1[i]; + size_t off2 = D - fShapeX2.size(); + for (size_t i = 0; i < fShapeX2.size(); ++i) + shapeX2_padded[off2 + i] = fShapeX2[i]; + } + + auto stridesX1 = UTILITY::ComputeStrideFromShape(shapeX1_padded); + auto stridesX2 = UTILITY::ComputeStrideFromShape(shapeX2_padded); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + std::string type1 = ConvertTypeToString(fTensorType1); + std::string type2 = ConvertTypeToString(fTensorType2); + 
std::string kname = "ComparisonKernel_" + opName; + std::string opname = ComparisionTrait::Name(); + + std::string op; + op = "\n//------ " + opname + "_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + type1 + " const* __restrict__ x1,\n"; + op += SP + SP + SP + type2 + " const* __restrict__ x2,\n"; + op += SP + SP + SP + "uint8_t* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(stridesY[d]) + "u) % " + + std::to_string(fShapeY[d]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const x1_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeX1_padded[d] == 1) + op += SP + SP + SP + SP + SP + "0u"; + else + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesX1[d]) + "u"; + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "std::size_t const x2_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeX2_padded[d] == 1) + op += SP + SP + SP + SP + SP + "0u"; + else + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesX2[d]) + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = "+ ComparisionTrait::Op("x1[x1_idx]" , "x2[x2_idx]") + " ;\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + std::string kname = "ComparisonKernel_" + opName; + return SP + kname + " comparisonKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE Comparision Op called to Generate without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::string kname = "comparisonKernel_" + opName; + + std::stringstream out; + out << "\n//------ " << ComparisionTrait::Name() << "_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "auto task_" << opName << " = alpaka::createTaskKernel(workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX1 << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNX2 << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << opName << ");\n"; + return out.str(); } diff --git a/core/inc/SOFIE/ROperator_Concat.hxx b/core/inc/SOFIE/ROperator_Concat.hxx new file mode 100644 index 0000000..f396554 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Concat.hxx @@ -0,0 +1,503 @@ +#ifndef SOFIE_ROPERATOR_Concat +#define SOFIE_ROPERATOR_Concat + + + #include "SOFIE/SOFIE_common.hxx" + 
#include "SOFIE/ROperator.hxx" + #include "SOFIE/RModel.hxx" + + #include + #include + #include + #include + #include + + namespace SOFIE{ + + class ROperator_Concat final : public ROperator + { + private: + int fAxis=0; + int fnewAxis=0; + std::vector fInputs; + std::string fOutput; + std::vectorfOutputShape; + std::vector fOutputShapeData; // in case output is a shape tensor we store here the output shape value data (can be parametric) + std::vector> fInputShapes; + ETensorType fInputType = ETensorType::UNDEFINED; + + public: + + ROperator_Concat(){} + ROperator_Concat(std::vector inputs, int axis, int newAxis, std::string output): + fAxis(axis), fnewAxis(newAxis), fOutput(UTILITY::Clean_name(output)) { + fInputs.reserve(inputs.size()); + for (auto & name : inputs) + fInputs.push_back(UTILITY::Clean_name(name)); + + fInputTensorNames.resize(fInputs.size()); + std::transform(fInputs.begin(), fInputs.end(), fInputTensorNames.begin(), + [](const std::string& s) -> std::string_view { return s; }); + fOutputTensorNames = { fOutput }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + // get shape of output given inputs. 
It is going to be called after initialized + std::vector> ShapeInference(std::vector> inputs) override { + std::vector> ret(1); + // treat negative axis case + if (fAxis<0) { + fAxis = inputs[0].size()+fAxis; + } + if (fAxis < 0 || fAxis >= (int) inputs[0].size()) + throw std::runtime_error("SOFIE Concat Op - invalid axis value "); + + int concat_dim=0; + // case of Concat (fNewAxis = 0) and not ConcatFromSequence + if(fnewAxis == 0){ + for (size_t i = 0; i < inputs.size(); i++) { + if (i > 0 && inputs[i].size() != inputs[i - 1].size()) + throw std::runtime_error("SOFIE Concat Op - input tensors have different shapes " + + ConvertShapeToString(inputs[i]) + " and " + ConvertShapeToString(inputs[i - 1])); + for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { + if ((int)iaxis == fAxis) + concat_dim += inputs[i][iaxis]; + else if (i > 0 && inputs[i][iaxis] != inputs[i - 1][iaxis]) + throw std::runtime_error("SOFIE Concat Op - input tensors have wrong shapes " + + ConvertShapeToString(inputs[i]) + " and " + + ConvertShapeToString(inputs[i - 1])); + } + } + + // output shape + ret[0] = inputs[0]; + ret[0][fAxis] = concat_dim; + } + std::vector stack; + // case ConCatFromSequence + if(fnewAxis == 1){ + for(size_t i = 0; i < inputs.size(); i++) { + if (i > 0 && inputs[i].size() != inputs[i-1].size() ) + throw std::runtime_error("SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " + + ConvertShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertShapeToString(inputs[i-1])); + for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { + if ((int) iaxis == fAxis) + stack.push_back(inputs[i][iaxis]); + else + if (i> 0 && inputs[i][iaxis] != inputs[i-1][iaxis]) + throw std::runtime_error("SOFIE Concat Op - input tensors have wrong shapes " + + ConvertShapeToString(inputs[i]) + " and " + ConvertShapeToString(inputs[i-1])); + } + + } + for(auto it:stack) + ret[0].push_back(it); + } + + return ret; + } + + // get shape of output given 
inputs. It is going to be called after initialized + std::vector ShapeInference(const std::vector> & inputs, const RModel & model) { + std::vector ret(inputs[0].size()); + // treat negative axis case + if (fAxis<0) { + fAxis = inputs[0].size()+fAxis; + } + if (fAxis < 0 || fAxis >= (int) inputs[0].size()) + throw std::runtime_error("SOFIE Concat Op - invalid axis value "); + + Dim concat_dim; + if(fnewAxis == 0){ + for (size_t i = 0; i < inputs.size(); i++) { + if (i > 0 && inputs[i].size() != inputs[i - 1].size()) + throw std::runtime_error("SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " + + ConvertDimShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertDimShapeToString(inputs[i - 1])); + for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { + if ((int)iaxis == fAxis) { + // support both integer and params shape for the concatenation axis + if (concat_dim.param.empty() && concat_dim.dim == 0) + concat_dim = inputs[i][iaxis]; + else if (inputs[i][iaxis].isParam || concat_dim.isParam) { + concat_dim = + Dim{ concat_dim.GetVal() + std::string(" + ") + inputs[i][iaxis].GetVal(), + static_cast(-1)}; + } else { + concat_dim = Dim { concat_dim.dim + inputs[i][iaxis].dim }; + } + } + else if (i == 0) { + ret[iaxis] = inputs[i][iaxis]; + } + else if ((!inputs[i][iaxis].isParam && !ret[iaxis].isParam) && (inputs[i][iaxis].dim != ret[iaxis].dim)) { + throw std::runtime_error("SOFIE Concat Op - input tensors have wrong shapes " + + ConvertDimShapeToString(inputs[i]) + " and " + + ConvertDimShapeToString(inputs[i - 1])); + } + else if (!inputs[i][iaxis].isParam && ret[iaxis].isParam){ + // if shape is not parametric use it + ret[iaxis] = inputs[i][iaxis]; + } + else if (inputs[i][iaxis].isParam && ret[iaxis].isParam) { + // check which parameter is first in RModel list + auto & dimNames = model.GetDimShapeNames(); + auto p1 = std::find(dimNames.begin(), dimNames.end(), inputs[i][iaxis].param); + auto p2 = 
std::find(dimNames.begin(), dimNames.end(), ret[iaxis].param); + if (p1 < p2) ret[iaxis] = inputs[i][iaxis]; + } + + } + // add parenthesis in case is an expression + if (concat_dim.isParam && concat_dim.dim == static_cast(-1)) + concat_dim = Dim{ std::string("(") + concat_dim.GetVal() + std::string(")"), concat_dim.dim }; + } + + // output shape for concatenated axis + ret[fAxis] = concat_dim; + + } + // case of stacking (not supported yet) + // here we need to check that input shapes are the same + // for example for fAxis == 0 + // output shapes: [inputs.size(), inputs[0][0], inputs[0][1],....] + if(fnewAxis == 1){ + throw std::runtime_error("SOFIE Concat Op - stacking (i.e. COncatFromSequence with new_axis=1) is not supported "); + } + return ret; + } + + void Initialize(RModel& model) override { + std::vector> inputIntShapes; + for (auto &it : fInputs) { + if (model.CheckIfTensorAlreadyExist(it) == false) { + throw std::runtime_error("SOFIE Concat Op Input Tensor " + it + " is not found in model"); + } + fInputShapes.push_back(model.GetDimTensorShape(it)); + if (!model.IsDynamicTensor(it)) { + inputIntShapes.push_back(ConvertShapeToInt(fInputShapes.back())); + } + } + if (inputIntShapes.size() == fInputs.size()) { + // if all input shapes are static we can compute output shape at initialization time + auto outputIntShape = ShapeInference(inputIntShapes)[0]; + fOutputShape = ConvertShapeToDim(outputIntShape); + if (model.Verbose()) + std::cout << "Initialize Concat operator with defined inputs shapes, " + << "output has shape " << ConvertShapeToString(outputIntShape) << std::endl; + + } else { + // if at least one input shape is dynamic we need to compute output shape using the symbolic expression for the dimensions + fOutputShape = ShapeInference(fInputShapes, model); + if (model.Verbose()) + std::cout << "Initialize Concat operator with dynamic inputs shapes, " + << "output has shape " << ConvertDimShapeToString(fOutputShape) << std::endl; + } + + // check if 
concat has constant inputs , axis 0(concat contigous memory and type is integer) + bool isOutputShape = false; + + // if (model.GetTensorType(fInputs[0]) == ETensorType::INT64 && fAxis == 0) { + fIsOutputConstant = true; + isOutputShape = true; + + for (auto &input : fInputs) { + if (model.IsDynamicTensor(input)) { + fIsOutputConstant = false; + isOutputShape = false; + break; + } + if (!model.IsInitializedTensor(input)) { + if (model.IsShapeTensor(input)) { + // if it is a shape tensor we can have constant output if the shapes are defined) + auto shapeData = model.GetShapeTensorValues(input); + bool isShapeFullyDefined = ConvertShapeToInt(shapeData).size() == shapeData.size(); + if (!isShapeFullyDefined) { + fIsOutputConstant = false; + } else { + // if shape is fully defined we can consider output as constant and we can compute the output + // shape at initialization time + fIsOutputConstant = fIsOutputConstant && true; + } + // inputs are then shape tensors and output is a shape tensor + isOutputShape = true; + } else { + // case of standard intermediate tensor + fIsOutputConstant = false; + isOutputShape = false; + break; + } + } else { + fIsOutputConstant = fIsOutputConstant && true; + } + } + //} + + if (fIsOutputConstant) { + auto outputShape = ConvertShapeToInt(fOutputShape); // conversion must be possible + std::vector outputData(ConvertShapeToLength(outputShape)); + size_t offset = 0; + for (auto &input : fInputs) { + auto inputData = static_cast(model.GetInitializedTensorData(input).get()); + auto inputShape = model.GetTensorShape(input); // shape is not dynamic if it is constant + size_t inputLength = ConvertShapeToLength(inputShape); + std::copy(inputData, inputData + inputLength, outputData.begin() + offset); + offset += inputLength; + // the data of the input tensor don't need to be written in the generated code and data file + model.SetNotWritableInitializedTensor(input); + } + model.AddConstantTensor(fOutput, outputShape, outputData.data()); + if 
(model.Verbose()) { + std::cout << "output of Concat is a constant tensor " << ConvertShapeToString(outputShape) << " : " + << ConvertValuesToString(outputData) << " (constant)" << std::endl; + } + } else if (isOutputShape) { + auto outputShape = ConvertShapeToInt(fOutputShape); // conversion must be possible + if (outputShape.size() != 1) + throw std::runtime_error("SOFIE Concat Op - output shape for shape tensor must have rank 1"); + // output shape is a rank 1 tensor with size equal to the output rank + std::vector outputData(outputShape[0]); + size_t offset = 0; + for (auto &input : fInputs) { + std::vector inputData; + auto inputShape = model.GetTensorShape(input); // shape is not dynamic + size_t inputLength = ConvertShapeToLength(inputShape); // shape can be a scalar + if (model.IsShapeTensor(input)) { + inputData = model.GetShapeTensorValues(input); + } else if (model.IsInitializedTensor(input)) { + inputData.resize(inputLength); + auto intData = static_cast(model.GetInitializedTensorData(input).get()); + for (size_t i = 0; i < inputData.size(); i++) + inputData[i] = Dim{static_cast(intData[i])}; + } else { + // this should not happen + throw std::runtime_error("SOFIE Concat Operator- invalid tensor input " + input + + " for shape output type"); + } + std::copy(inputData.begin(), inputData.end(), outputData.begin() + offset); + offset += inputLength; + } + // add output tensor + model.AddShapeTensor(fOutput, outputData, false); // cannot be a scalar + fOutputShapeData = outputData; + if (model.Verbose()) { + std::cout << "output of Concat is a shape tensor " << ConvertShapeToString(outputShape) << " : " + << ConvertDimShapeToString(outputData) << " (shape)" << std::endl; + } + fIsOutputParamShape = true; + } + if (!fIsOutputConstant && !fIsOutputParamShape) { + fInputType = model.GetTensorType(fInputs[0]); + model.AddIntermediateTensor(fOutput, fInputType, fOutputShape); + if (model.Verbose()) { + std::cout << "Concat ---> " << fOutput << " " << 
ConvertDimShapeToString(fOutputShape) << std::endl; + } + } + } + + std::string Generate(std::string opName) override { + opName = "op_" + opName; + std::stringstream out; + out<<"\n//--------- Concat " << opName << " --> " << fOutput << " " << ConvertDimShapeToString(fOutputShape) << "\n"; + + if (fIsOutputConstant) return out.str(); + + if (fIsOutputParamShape) { + // output is a shape tensor defined by the concatenation of the input shapes + out << "// output is a shape tensor defined by the concatenation of the input shapes\n"; + for (int i = 0; i < static_cast(fOutputShape + [0].dim); i++) { + out << SP << "tensor_" << fOutput << "[" << i << "] = " << fOutputShapeData[i] << ";\n"; + } + return out.str(); + } + // special case when memory is contiguous + bool hasShapeOnes = true; + for(int i = 0; i 0) + out << offset; + offset += " + " + length; + out << ", " << "tensor_" << fInputs[i] << ", " + length << ");\n"; + } + } + else { + + std::vector outStride = UTILITY::ComputeStrideFromShape(fOutputShape); + std::vector> inStrides(fInputs.size()); + int idx = 0; + for ( auto &s : inStrides) { + s = UTILITY::ComputeStrideFromShape(fInputShapes[idx]); + idx++; + } + for (int i = 0; i < fAxis; ++i) { + // loop on dimensions + out << SP << "for (size_t i" << i << " = 0; i" << i << " < " << fOutputShape[i].GetVal() << "; ++i" << i <<") {\n"; + } + + out << SP << SP << SP << "int idxOut = "; + for (int k = 0; k < fAxis; k++) { + if (k > 0) out << " + "; + out << outStride[k].GetVal() << "*i" << k; + } + out << ";\n"; + + for (size_t j = 0; j < fInputs.size(); j++) { + if (j>0) + out << SP << SP << SP << "idxOut += " << inStrides[j-1][fAxis-1].GetVal() << ";\n"; + out << SP << SP << SP << "int idxIn" << j <<" = "; + for (int k = 0; k < fAxis; k++) { + if (k > 0) out << " + "; + out << inStrides[j][k].GetVal() << "*i" << k; + } + out << ";\n"; + out << SP << SP << SP << "for (size_t iC = 0; iC < " << inStrides[j][fAxis-1].GetVal() << "; ++iC) {\n"; + out << SP << SP << SP 
<< SP << "tensor_" << fOutput << "[idxOut+iC] = tensor_" << fInputs[j] << "[idxIn" << j << "+iC];\n"; + out << SP << SP << SP << "}\n"; + // concatenate the axis values + } + for (int i = 0; i < fAxis; ++i) { + out << SP << "}\n"; + } + } + + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fOutputShape.empty()) + throw std::runtime_error("SOFIE Operator Concat called to Generate without being initialized first"); + + const std::size_t D = fOutputShape.size(); + const std::size_t Nin = fInputs.size(); + + auto outStrides = UTILITY::ComputeStrideFromShape(fOutputShape); + + std::vector prefix(Nin); + prefix[0] = 0; + for (std::size_t k = 1; k < Nin; ++k) + prefix[k] = prefix[k - 1] + std::stoul(fInputShapes[k - 1][fAxis].GetVal()); + + std::vector> inStrides(Nin); + for (std::size_t k = 0; k < Nin; ++k) + inStrides[k] = UTILITY::ComputeStrideFromShape(fInputShapes[k]); + + std::string op; + op = "\n//------ CONCAT_KERNEL_ALPAKA\n"; + op += SP + "struct ConcatKernel_" + opName + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "std::array inputs,\n"; + op += SP + SP + SP + "T* output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "std::size_t remaining;\n"; + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + op += SP + SP + SP + SP + "remaining = elem_idx;\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string stride_val = outStrides[d].GetVal(); + op 
+= SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = remaining / " + stride_val + "u;\n"; + op += SP + SP + SP + SP + "remaining -= out_" + std::to_string(d) + + " * " + stride_val + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t chosen = 0;\n"; + for (std::size_t k = 0; k < Nin; ++k) { + std::size_t end_k = prefix[k] + std::stoul(fInputShapes[k][fAxis].GetVal()); + op += SP + SP + SP + SP + "chosen += static_cast(" + + std::to_string(end_k) + "u <= out_" + std::to_string(fAxis) + ");\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const output_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + SP + "out_" + std::to_string(d) + + " * " + outStrides[d].GetVal() + "u"; + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t k = 0; k < Nin; ++k) { + op += SP + SP + SP + SP + SP + "(chosen == " + std::to_string(k) + "u) * (\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string coord = (d == static_cast(fAxis)) + ? ("(out_" + std::to_string(d) + " - " + std::to_string(prefix[k]) + "u)") + : ("out_" + std::to_string(d)); + op += SP + SP + SP + SP + SP + SP + coord + + " * " + inStrides[k][d].GetVal() + "u"; + op += (d + 1 < D) ? " +\n" : "\n"; + } + op += SP + SP + SP + SP + SP + ")"; + op += (k + 1 < Nin) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[output_idx] = inputs[chosen][input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + return SP + "ConcatKernel_" + opName + " concatKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fOutputShape.empty()) { + throw std::runtime_error("SOFIE Operator Concat called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDimShapeToLength(fOutputShape); + out << "\n//------ CONCAT_GPU_ALPAKA\n"; + switch (fInputType){ + case ETensorType::FLOAT: + out << SP << "std::array input_ptrs_" << OpName << " = {"; break; + case ETensorType::INT64: + out << SP << "std::array input_ptrs_" << OpName << " = {"; break; + default: + throw std::runtime_error("Data type for Concat operator is not yet supported."); + } + for(size_t i=0; i0) out << ", "; + out << "alpaka::getPtrNative(deviceBuf_" << fInputs[i] << ")"; + } + out << "};\n"; + + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << OpName + << ", concatKernel_" << OpName << ", input_ptrs_" << OpName << ", alpaka::getPtrNative(deviceBuf_" << fOutput << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + }; + }//SOFIE + + + #endif //SOFIE_ROPERATOR_CONCAT diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx b/core/inc/SOFIE/ROperator_Constant.hxx similarity index 84% rename from src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx rename to core/inc/SOFIE/ROperator_Constant.hxx index 0d08432..8640e96 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx +++ b/core/inc/SOFIE/ROperator_Constant.hxx @@ -52,7 +52,7 @@ public: // case 
of ConstantOfShape (since no inputs in case of Constant operator) fIsConstantOfShape = true; if (model.CheckIfTensorAlreadyExist(fNX) == false){ - throw std::runtime_error("TMVA SOFIE ConstantOfShape Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE ConstantOfShape Op Input Tensor is not found in model"); } // get output shape from input values: // can work only if input is a constant or initialized tensor (or dynamic one) @@ -60,7 +60,7 @@ public: auto input_tensor = static_cast(dptr.get()); auto input_shape = model.GetTensorShape(fNX); if (input_shape.size() > 1 ) - throw std::runtime_error("TMVA SOFIE ConstantOfShape Op Input Tensor has invalid shape"); + throw std::runtime_error("SOFIE ConstantOfShape Op Input Tensor has invalid shape"); if (input_tensor != nullptr && !input_shape.empty()) { fShape = std::vector (input_shape[0]); for (size_t i = 0; i < fShape.size(); i++) @@ -70,7 +70,7 @@ public: length = ConvertShapeToLength(fShape); if (fValues.size() != 1) - throw std::runtime_error("TMVA SOFIE ConstantOfShape Op value Tensor has invalid size " + std::to_string(fValues.size())); + throw std::runtime_error("SOFIE ConstantOfShape Op value Tensor has invalid size " + std::to_string(fValues.size())); T value = fValues[0]; fValues = std::vector(length, value); @@ -80,7 +80,7 @@ public: // in case of standard constant the shape is provided as input length = ConvertShapeToLength(fShape); if (length != fValues.size()) - throw std::runtime_error("TMVA SOFIE Constant Op has invalid shape : " + ConvertShapeToString(fShape) + + throw std::runtime_error("SOFIE Constant Op has invalid shape : " + ConvertShapeToString(fShape) + " with " + std::to_string(fValues.size()) + " values"); } @@ -101,6 +101,11 @@ public: // no code to generate here. Tensor are defined in Session constructor return "//---------------------------------------\n"; } + + std::string Generate_GPU_ALPAKA(std::string /* OpName */) override { + // no code to generate here. 
Tensor are defined in Session constructor + return "//---------------------------------------\n"; + } }; }//SOFIE diff --git a/core/inc/SOFIE/ROperator_Conv.hxx b/core/inc/SOFIE/ROperator_Conv.hxx new file mode 100644 index 0000000..835a0ff --- /dev/null +++ b/core/inc/SOFIE/ROperator_Conv.hxx @@ -0,0 +1,999 @@ +#ifndef SOFIE_SOFIE_ROPERATOR_CONV +#define SOFIE_SOFIE_ROPERATOR_CONV + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include +#include +#include +#include + + +namespace SOFIE { + +template +class ROperator_Conv final : public ROperator +{ +private: + bool fBroadcastBias = false; + + std::string fAttrAutopad; + std::vector fAttrDilations; + size_t fAttrGroup; + std::vector fAttrKernelShape; + std::vector fAttrPads; + std::vector fAttrStrides; + + std::string fNX; + std::string fNW; + std::string fNB; + std::string fNY; + + std::string convK; + std::string imcol; + + std::vector fShapeX; + std::vector fShapeW; + std::vector fShapeB; + std::vector fShapeY; + + std::string fType; + + size_t fDim; // dimension of the convolution + + +public: + + ROperator_Conv() {} + + ROperator_Conv(std::string autopad, std::vector dilations, + size_t group, std::vector kernelShape, std::vector pads, + std::vector strides, std::string nameX, std::string nameW, + std::string nameB, std::string nameY): + fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape), + fAttrPads(pads), fAttrStrides(strides), + fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)), + fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) + { + if(std::is_same::value) { + fType = "float"; + } else { + throw + std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator"); + } + fInputTensorNames = { fNX, fNB }; + fOutputTensorNames = { fNY }; + fKind = OperatorKind::CONV; + } + + ROperator_Conv(std::string autopad, std::vector dilations, + 
size_t group, std::vector kernelShape, std::vector pads, + std::vector strides, std::string nameX, std::string nameW, + std::string nameY): + fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape), + fAttrPads(pads), fAttrStrides(strides), + fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)), fNY(UTILITY::Clean_name(nameY)) + { + if(std::is_same::value) { + fType = "float"; + } else { + throw + std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator"); + } + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + fKind= OperatorKind::CONV; + } + + std::vector TypeInference(std::vector input) override { + ETensorType out = input[0]; + return {out}; + } + + // function returning output shape given input + std::vector DoShapeInference(const std::vector & input, const std::vector & weight) { + // shape of convolution input has to be (according to ONNX): N x C x H x W + // Where N : batch size, C : input channels, H : input height, W : input width + + if (input.size() -2 != fDim) { + throw std::runtime_error("TMVA SOFIE Conv Op Shape inference - invalid input "); + } + if (weight.size() -2 != fDim) { + throw std::runtime_error("TMVA SOFIE Conv Op Shape inference - invalid weights "); + } + if (fAttrGroup == 0 && input[1].isParam) + throw std::runtime_error("TMVA SOFIE Conv - param shapes not supported without group attr"); + if (fAttrKernelShape.empty()) { + if (input[2].isParam || (fDim > 1 && input[3].isParam) || (fDim > 2 && input[4].isParam)) + throw std::runtime_error("TMVA SOFIE Conv - param shapes not supported without kernel attr"); + } + + if (fAttrGroup == 0) { + fAttrGroup = input[1].dim / weight[1]; + } + + // kernel shape + size_t k1 = ((fAttrKernelShape.empty())? weight[2] : fAttrKernelShape[0]); + size_t k2 = (fDim > 1) ? ((fAttrKernelShape.empty()) ? weight[3] : fAttrKernelShape[1]) : 1; + size_t k3 = (fDim > 2) ? ((fAttrKernelShape.empty()) ? 
weight[4] : fAttrKernelShape[2]) : 1; + + + size_t i1 = (fDim > 1) ? ((fDim > 2) ? 3 : 2) : 1; + size_t i2 = (fDim > 2) ? 4 : 3; + size_t i3 = 5; + + if (fAttrDilations.empty()) { + fAttrDilations = {1, 1, 1}; + } + fAttrDilations.resize(3); + if (fDim < 3) { + fAttrDilations.resize(3, 1); + } + // Shape of the kernel + fAttrKernelShape = {k1 + (fAttrDilations[0] - 1) * (k1 - 1), + k2 + (fAttrDilations[1] - 1) * (k2 - 1), + k3 + (fAttrDilations[2] - 1) * (k3 - 1)}; + + if (fAttrStrides.empty()) { + fAttrStrides = {1, 1, 1}; + } + if (fDim < 3) + fAttrStrides.resize(3, 1); + + if (fAttrAutopad == "NOTSET") { + if (fAttrPads.empty()) { + fAttrPads = {1, 1, 1, 1, 1, 1}; + } + } else if (fAttrAutopad == "SAME_UPPER" || fAttrAutopad == "SAME_LOWER") { + for (size_t d = 0; d < fDim; ++d) { + if (input[d + 2].isParam) + throw std::runtime_error( + "TMVA SOFIE Conv Op: SAME padding with parametric input shape is not supported"); + } + // ONNX SAME padding: total_pad = max(0, (ceil(in/stride)-1)*stride + kernel - in) + // SAME_UPPER places extra padding at end, SAME_LOWER at beginning + fAttrPads.assign(6, 0); + for (size_t d = 0; d < fDim; ++d) { + size_t inSize = input[d + 2].dim; + size_t stride_d = fAttrStrides[d]; + size_t outSize = (inSize + stride_d - 1) / stride_d; + int totalPad = std::max(0, (int)((outSize - 1) * stride_d + fAttrKernelShape[d]) - (int)inSize); + if (fAttrAutopad == "SAME_UPPER") { + fAttrPads[d] = (size_t)(totalPad / 2); + fAttrPads[d + fDim] = (size_t)(totalPad - totalPad / 2); + } else { + fAttrPads[d] = (size_t)(totalPad - totalPad / 2); + fAttrPads[d + fDim] = (size_t)(totalPad / 2); + } + } + } else if (fAttrAutopad != "VALID") { + throw + std::runtime_error("TMVA SOFIE Conv Op invalid fAutopad"); + } + // to be sure pad is vector of size 6 + if (fDim < 3) fAttrPads.resize(6, 0); + + Dim input1 = input[2]; + Dim input2 = (fDim > 1) ? input[3] : Dim{1}; + Dim input3 = (fDim > 2) ? 
input[4] : Dim{1}; + + size_t pad1 = fAttrPads[0] + fAttrPads[i1]; + + // function to get output dimension of convolution given input + + auto computeOutput = [&](Dim inputDim, size_t kernel, size_t pad, size_t stride) { + if (!inputDim.isParam) { + size_t outSize = (inputDim.dim + pad - kernel) / stride + 1; + return Dim{outSize}; + } else { + if (stride == 1){ + if ((pad - kernel + 1) == 0 ) + // output is same as input + return inputDim; + else { + int64_t v = pad - kernel + 1; + std::string outStr = "(" + inputDim.param + "+" + std::to_string(v) + ")"; + return Dim{ outStr, static_cast(-1)}; + } + } else { // general case (stride not 1) + int64_t v = pad - kernel; + std::string outStr = "((" + inputDim.param + "+" + std::to_string(v) + ")/" + + std::to_string(stride) + "1)"; + return Dim{ outStr, static_cast(-1)}; + } + } + throw std::runtime_error("TMVA SOFIE Conv Op - invalid values"); + return Dim{}; + }; + + Dim output1 = computeOutput(input1, fAttrKernelShape[0], pad1, fAttrStrides[0]); + + Dim batch_size = input[0]; // first element in input tensor + Dim output_channels = Dim{weight[0]}; // first element in weight tensor + + std::vector ret({ batch_size, output_channels, output1 }); + + if (fDim == 1) + return ret; + + size_t pad2 = fAttrPads[1] + fAttrPads[i2]; + Dim output2 = computeOutput(input2, fAttrKernelShape[1], pad2, fAttrStrides[1]); + + // output is N x M x OH x OW + ret.push_back(output2); + if (fDim == 2) + return ret; + + size_t pad3 = fAttrPads[2] + fAttrPads[i3]; + Dim output3 = computeOutput(input3, fAttrKernelShape[2], pad3, fAttrStrides[2]); + + // output is N x M x OH x OW x OD + ret.push_back(output3); + return ret; + } + + void Initialize(RModel& model) override { + fUseSession = model.UseSession(); + if (!model.CheckIfTensorAlreadyExist(fNX)) { + throw + std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNX + " is not found in model"); + } + fShapeX = model.GetDimTensorShape(fNX); + if (fShapeX.size() < 3 || fShapeX.size() > 
5) { + std::cout << fNX << " : " << ConvertDimShapeToString(fShapeX) << std::endl; + throw + std::runtime_error("TMVA SOFIE Conv Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions"); + } + fDim = fShapeX.size() - 2; + if (!model.CheckIfTensorAlreadyExist(fNW)) { + throw + std::runtime_error("TMVA SOFIE Conv op Input weight Tensor " + fNW + " is not found in model"); + } + fShapeW = model.GetTensorShape(fNW); + if (fShapeW.size() < 3 || fShapeW.size() > 5) { + std::cout << fNW << " : " << ConvertShapeToString(fShapeW) << std::endl; + throw std::runtime_error("TMVA SOFIE Conv Op input weight tensor" + fNW + " is not of 3,4 or 5 dimensions"); + } + fShapeY = DoShapeInference(fShapeX, fShapeW); + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + if (fNB != "") { + if (!model.CheckIfTensorAlreadyExist(fNB)) { + throw + std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNB + " is not found in model"); + } + fShapeB = model.GetTensorShape(fNB); + if (fShapeB.size() != 1) + throw + std::runtime_error("TMVA SOFIE Conv op : invalid shape for Bias tensor (is not 1D)"); + std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); + auto shapeDimB = model.GetDimTensorShape(fNB); + bool broadcast_needed = !UTILITY::AreSameShape(shapeDimB, targetShape); + if (broadcast_needed) { + auto original_data = model.GetInitializedTensorData(fNB); + // make bias shape equal to Y shape by adding 1 + if (fShapeB.size() < 1) + throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has empty shape"); + // we assume bias tensor dimension is equal to number of filters that is the second dimension in + // the output tensor + if (!(shapeDimB[0] == fShapeY[1])) + throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has wrong shape: " + + ConvertShapeToString(fShapeB)); + if (fType != "float") + throw std::runtime_error("TMVA SOFIE Conv op: Broadcasting for non-float type tensors is not supported"); + // here is the actual broadcasting + fBroadcastBias 
= true; + if (!fUseSession) { + // do here broadcasting + std::vector shape(fDim + 1, 1); + shape[0] = fShapeB[0]; + auto intTargetShape = ConvertShapeToInt(targetShape); + std::shared_ptr new_data_ptr( + UTILITY::UnidirectionalBroadcast(static_cast(original_data.get()), shape, intTargetShape), + std::default_delete()); + model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), intTargetShape, new_data_ptr); + fShapeB = model.GetTensorShape(fNB); + } + } + } + // output channel size can be parametric and is an expression + std::vector outputDims = std::vector(fShapeY.begin()+2, fShapeY.end()); + //check if shape is not parametric + std::vector outputInts = ConvertShapeToInt(outputDims); + Dim channelDim; + if (outputInts.empty()) { + auto outputChannelSize = ConvertDimShapeToLength(outputDims); // size/channel = D * H * W + channelDim = Dim{ outputChannelSize, static_cast(-1)}; + } else { + size_t outputChannelSize = ConvertShapeToLength(outputInts); + channelDim = Dim{ outputChannelSize }; + } + size_t kernelSize = fAttrKernelShape[0]; + for (size_t i = 1; i < fDim; i++) { + kernelSize *= fAttrKernelShape[i]; + } + + std::vector shape1 = {fShapeW[0], fShapeW[1], kernelSize}; + std::vector shape2 = {Dim{fShapeW[1]}, Dim{kernelSize}, channelDim }; + model.AddIntermediateTensor(fNX +"_f", ConvertStringToType(fType), shape1 ); + model.AddIntermediateTensor(fNX +"_xcol", ConvertStringToType(fType), shape2 ); + convK = fNX +"_f"; + imcol = fNX +"_xcol"; + fOutputTensorNames.emplace_back(convK); + fOutputTensorNames.emplace_back(imcol); + fInputTensorNames.emplace_back(convK); + fInputTensorNames.emplace_back(imcol); + + if (model.Verbose()) { + std::cout << "Conv - " << fDim << " " << fNX << " : " << ConvertDimShapeToString(fShapeX) + << " --> " << fNY << " : " << ConvertDimShapeToString(fShapeY) << std::endl; + } + } + + std::string GenerateInitCode() override { + std::stringstream out; + // Generate initialization code for broadcasting of bias tensor + if 
(fBroadcastBias) { + // include a separate scope to avoid defining unique operator temp variables + std::vector shape(fDim + 1, 1); + // bias (is a 1D tensor) + shape[0] = fShapeB[0]; + std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); + out << "//--- broadcast bias tensor " << fNB << "for Conv op if needed \n"; + // in case of dynamic tensors check needs to be done at run time + bool isOutDynamic = ConvertShapeToInt(targetShape).empty(); + auto length = ConvertDimShapeToLength(targetShape); + if (isOutDynamic) + out << SP << "if (" << length << " > " << ConvertShapeToLength(shape) << ") {\n"; + else + out << SP << "{\n"; + out << SP << SP << "float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" + << fNB << ", " << ConvertShapeToString(shape) << ", " << ConvertDimShapeToString(fShapeY) << ");\n"; + out << SP << SP << "fTensor_" << fNB << ".resize(" << length << ");\n"; + out << SP << SP << "std::copy(data, data + " << length << ", fTensor_" << fNB << ".begin());\n"; + out << SP << SP << "tensor_" << fNB << " = fTensor_" << fNB << ".data();\n"; + out << SP << SP << "delete[] data;\n"; + out << SP << "}\n"; + } + return out.str(); + } + + std::string Generate(std::string OpName) override { + OpName = "op_" + OpName; + + if (fShapeX.empty() || fShapeW.empty() || (fNB != "" && fShapeB.empty()) || fShapeY.empty()) { + throw + std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first"); + } + + std::stringstream out; + auto bsize = fShapeX[0]; + size_t kDepth = (fDim > 2) ? fShapeW[2] : 1; // kernel depth + size_t kHeight = (fDim > 1) ? fShapeW[fDim] : 1; // kernel height + size_t kWidth = fShapeW[fDim+1]; // kernel width + auto iDepth = (fDim > 2) ? fShapeX[2] : Dim{1}; // input depth + auto iHeight = (fDim > 1) ? fShapeX[fDim] : Dim{1}; // input height + auto iWidth = fShapeX[fDim+1]; // input width + auto oDepth = (fDim > 2) ? fShapeY[2] : Dim{1}; // output depth + auto oHeight = (fDim > 1) ? 
fShapeY[fDim] : Dim{1}; // ouput height + auto oWidth = fShapeY[fDim+1]; // output width + // total output size for a channel + auto outputChannelStride = ConvertDimShapeToLength(std::vector{oDepth, oHeight, oWidth}); // size of channel = D * H * W + auto outputBatchStride = ConvertDimShapeToLength(std::vector{fShapeY[1] , oDepth, oHeight, oWidth}); // size of C * D * H * W + // input size + auto inputChannelStride = ConvertDimShapeToLength(std::vector{iDepth, iHeight, iWidth}); + auto inputBatchStride = ConvertDimShapeToLength(std::vector{fShapeX[1] , iDepth, iHeight, iWidth}); // size of C * D * H * W + + out << "\n//---- operator Conv " << OpName << "\n"; + + // vectorize the (dilated)convolution kernels into a matrix + // no need to transpose the matrix + // to fix for 1d and 3d + + size_t id = (fDim > 2) ? fDim-3 : 2; + size_t ih = (fDim > 1) ? fDim-2 : 1; + size_t iw = fDim-1; + + size_t wstrideDil = fAttrDilations[iw]; + size_t hstride = kWidth; + size_t hstrideDil = fAttrDilations[ih] * fAttrKernelShape[iw]; // stride dilated in the height + size_t dstride = kHeight * kWidth; + size_t dstrideDil = fAttrDilations[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw]; + size_t icstride = kHeight * kWidth * kDepth; + size_t icstrideDil = fAttrKernelShape[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw]; + size_t ocstride = fShapeW[1] * icstride; + size_t ocstrideDil = fShapeW[1] * icstrideDil; + + out << SP << "for (std::size_t oc = 0; oc < " << fShapeW[0] << "; oc++) {\n"; + out << SP << SP << "for (std::size_t ic = 0; ic < " << fShapeW[1] << "; ic++) {\n"; + if (fDim > 2) + out << SP << SP << SP << "for (std::size_t kd = 0; kd < " << kDepth << "; kd++) {\n"; + if (fDim > 1) + out << SP << SP << SP << "for (std::size_t kh = 0; kh < " << kHeight << "; kh++) {\n"; + out << SP << SP << SP << SP << "for (std::size_t kw = 0; kw < " << kWidth << "; kw++) {\n"; + + out << SP << SP << SP << SP << SP << "tensor_" < 2) out << " + kd * " << dstrideDil; + if (fDim > 1) out 
<< " + kh * " << hstrideDil; + out << " + kw * " << wstrideDil << " ] = tensor_" << fNW << "[oc * " << ocstride << " + ic * " << icstride; + if (fDim > 2) out << " + kd * " << dstride; + if (fDim > 1) out << " + kh * " << hstride; + out << " + kw ];\n"; + + out << SP << SP << SP << SP << "}\n"; + if (fDim > 1) out << SP << SP << SP << "}\n"; + if (fDim > 2) out << SP << SP << SP << "}\n"; + out << SP << SP << "}\n"; + out << SP << "}\n"; + + //out << SP << "char " << OpName << "_transA = 'T';\n"; + out << SP << "char " << OpName << "_transA = 'N';\n"; + out << SP << "char " << OpName << "_transB = 'N';\n"; + out << SP << "int " << OpName << "_m = " << outputChannelStride << ";\n"; // output h*w + assert(fShapeY[1] == fShapeW[0]); + //assert(fShapeW[1] == fShapeX[1] / fAttrGroup); + out << SP << "int " << OpName << "_n = " << fShapeW[0] << ";\n"; // output channels + out << SP << "int " << OpName << "_k = " << fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] << ";\n"; + out << SP << "float " << OpName << "_alpha = 1.0;\n"; + if (fNB != "") + out << SP << "float " << OpName << "_beta = 1.0;\n"; + else // when bias is not present beta needs to be equal to zero to avoid re-using previous results in output tensor + out << SP << "float " << OpName << "_beta = 0.0;\n"; + + + // Loop on batch size + out << SP << "for (size_t n = 0; n < " << bsize << "; n++) {\n"; + + // IM2COL: Unroll the input tensor + // order input data as (e.g. kernel 2x2) and (xa,ya) is channel 1 and (xb,yb) is channel 2 + // (xa1,..,xak,ya1,..yak)(xb1,...,xbk,yb1,..,ybk) + // (xa2,...xak+1,ya1,...yak)(......) + // trick for speed is using caffe im2col and output a matrix which contains filtered values as rows. 
+ // By doing this one has consecutive memory reads and writes + // Resulting matrix op_xcol is (input channels * filter_h * filter_w , output_h * output_w) + if (fDim ==1) { + if (fAttrPads[0] != fAttrPads[1] ) { + std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " + << std::endl; + fAttrPads[0] = (fAttrPads[0] + fAttrPads[1]) / 2; + } + fAttrPads[1] = 0; + fAttrStrides[1] = 1; + } + if (fDim == 2) { + if (fAttrPads[0] != fAttrPads[2] || fAttrPads[1] != fAttrPads[3]) { + std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " << std::endl; + fAttrPads[0] = (fAttrPads[0] + fAttrPads[2]) / 2; + fAttrPads[1] = (fAttrPads[1] + fAttrPads[3]) / 2; + } + } + if (fDim == 3) { + if (fAttrPads[0] != fAttrPads[3] || fAttrPads[1] != fAttrPads[4] || fAttrPads[2] != fAttrPads[5]) { + std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " << std::endl; + fAttrPads[0] = (fAttrPads[0] + fAttrPads[3]) / 2; + fAttrPads[1] = (fAttrPads[1] + fAttrPads[4]) / 2; + fAttrPads[2] = (fAttrPads[2] + fAttrPads[5]) / 2; + } + } + out << SP << SP << "size_t out_offset = n * " << outputBatchStride << ";\n"; + + if (fAttrGroup == 1) { + out << SP << SP << "size_t x_offset = n * " << inputBatchStride << ";\n"; + // when using im2col - resulting matrix is transposed, the dimension is (input_c * filter_h * filter_y, output_h * + // output_w) + if (fDim < 3) { + out << SP << SP << "SOFIE::UTILITY::Im2col(tensor_" << fNX + << " + x_offset," + // channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, + // dilation_w, + // + << fShapeW[1] << "," << iHeight << "," << iWidth << ","; + if (fDim == 1) + out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1," + << fAttrDilations[0]; + else // dim ==2 + out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << 
fAttrPads[1] + << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << "," + << fAttrDilations[1]; + out << "," << "tensor_" <(tensor_" << fNX + << " + x_offset," + // channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, + // dilation_d, dilation_h, dilation_w, + // + << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << "," + << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," + << fAttrPads[0] << "," << fAttrPads[1] << "," << fAttrPads[2] << "," + << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] << "," + << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << "," + << "tensor_" << fNX << "_xcol);\n\n "; + } + // BLAS + out << SP << "SOFIE::Gemm_Call(" + << "tensor_" << fNY << " + out_offset, false, false, " + << OpName << "_m, " << OpName << "_n, " << OpName << "_k, " + << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, tensor_" << fNX << "_f, " + << OpName << "_beta, "; + if (fNB != "") + out << "tensor_" << fNB; + else + out << "nullptr"; + out << ");\n"; + + + // out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &" + // << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, &" << OpName + // << "_m,\n"; // use m if op_xcol is not transpose , otherwise k + // out << SP << SP << SP << "tensor_" << fNX << "_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY + // << " + out_offset, &" << OpName << "_m);\n"; + } else { + // case of group convolution + // Unroll (IM2COL) the input tensor- make loop on groups and repeat operations (IM2COL + GEMM for each + // group) + // out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n"; + out << SP << SP << "for (size_t g = 0; g < " << fAttrGroup << "; g++) {\n"; + out << SP << SP << "size_t x_offset = n * " << 
inputBatchStride << " + g * " + << fShapeW[1] << " * " << inputChannelStride << ";\n "; + out << SP << SP << "size_t g_offset = g * " << fShapeW[0] << " * (" << outputChannelStride << ") / " << fAttrGroup << ";\n "; + out << SP << SP << "size_t out_offset = n * " << outputBatchStride << " + g_offset;\n"; + + if (fDim < 3) { + out << SP << SP << "SOFIE::UTILITY::Im2col(tensor_" << fNX + << " + x_offset," + // channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, + // dilation_w, + // + << fShapeW[1] << "," << iHeight << "," << iWidth << ","; + if (fDim == 1) + out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1," + << fAttrDilations[0]; + else // dim ==2 + out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1] + << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << "," + << fAttrDilations[1]; + out << ", tensor_" << fNX << "_xcol);\n\n "; + } else { + // 3d im2col + out << SP << SP << "SOFIE::UTILITY::Im2col_3d(tensor_" << fNX + << " + x_offset," + // channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, + // dilation_d, dilation_h, dilation_w, + // + << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << "," << fAttrKernelShape[0] << "," + << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," << fAttrPads[0] << "," << fAttrPads[1] + << "," << fAttrPads[2] << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] + << "," << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << ",tensor_" << fNX + << "_xcol);\n\n "; + } + + // BLAS + // n must be divided by the number of groups + out << SP << SP << SP << OpName << "_n = " << fShapeW[0] / fAttrGroup << ";\n"; + // offset g must be g * k * n + out << SP << SP << SP << "size_t offset_f = g * " + << fShapeW[0] * fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * 
fAttrKernelShape[2] / fAttrGroup + << ";\n"; + + out << SP << "SOFIE::Gemm_Call(" + << "tensor_" << fNY << " + out_offset, false, false, " + << OpName << "_m, " << OpName << "_n, " << OpName << "_k, " + << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, tensor_" << fNX << "_f + offset_f, " + << OpName << "_beta, "; + if (fNB != "") + out << "tensor_" << fNB << " + g_offset"; + else + out << "nullptr"; + out << ");\n"; + out << SP << SP << "}\n"; // end of group loop + } + out << SP << "}\n"; // end of batch size loop + + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty() || fShapeW.empty() || fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first"); + + size_t oDepth = (fDim > 2) ? fShapeY[2].dim : 1; + size_t oHeight = (fDim > 1) ? fShapeY[fDim].dim : 1; + size_t oWidth = fShapeY[fDim + 1].dim; + size_t iDepth = (fDim > 2) ? fShapeX[2].dim : 1; + size_t iHeight = (fDim > 1) ? fShapeX[fDim].dim : 1; + size_t iWidth = fShapeX[fDim + 1].dim; + size_t kHeight = (fDim > 1) ? fShapeW[fDim] : 1; + size_t kWidth = fShapeW[fDim + 1]; + size_t kDepth = (fDim > 2) ? fShapeW[2] : 1; + + size_t kernelSize = fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2]; + size_t colRows = fShapeW[1] * kernelSize; + size_t colCols = oDepth * oHeight * oWidth; + size_t colElements = colRows * colCols; + size_t outChannels = fShapeW[0]; + size_t spatialSize = oDepth * oHeight * oWidth; + + // Strides for weight vectorisation + size_t id = (fDim > 2) ? fDim - 3 : 2; + size_t ih = (fDim > 1) ? 
fDim - 2 : 1; + size_t iw = fDim - 1; + size_t wstrideDil = fAttrDilations[iw]; + size_t hstrideDil = fAttrDilations[ih] * fAttrKernelShape[iw]; + size_t dstrideDil = fAttrDilations[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw]; + size_t icstrideDil = fAttrKernelShape[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw]; + size_t ocstrideDil = fShapeW[1] * icstrideDil; + size_t hstride = kWidth; + size_t dstride = kHeight * kWidth; + size_t icstride = kHeight * kWidth * kDepth; + size_t ocstride = fShapeW[1] * icstride; + size_t wTotalElements = ConvertShapeToLength(fShapeW); + + std::string op; + + // Kernel 1: Weight vectorisation — reorder W into _f with dilation layout + // Each thread handles one output element of _f + std::string wKname = "WeightVecKernel_" + opName; + op = "\n//------ WEIGHT_VEC_KERNEL_ALPAKA (Conv " + opName + ")\n"; + op += SP + "struct " + wKname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ W,\n"; + op += SP + SP + SP + "T* __restrict__ f,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + // Decompose elem_idx into (oc, ic, kd, kh, kw) using compile-time strides + op += SP + SP + SP + SP + "std::size_t const oc = elem_idx / " + std::to_string(ocstride) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const oc_rem = elem_idx % " + std::to_string(ocstride) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const ic = oc_rem / " + std::to_string(icstride) + "u;\n"; + op += SP + SP + SP + 
SP + "std::size_t const ic_rem = oc_rem % " + std::to_string(icstride) + "u;\n"; + if (fDim > 2) { + op += SP + SP + SP + SP + "std::size_t const kd = ic_rem / " + std::to_string(kHeight * kWidth) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kh = (ic_rem / " + std::to_string(kWidth) + "u) % " + std::to_string(kHeight) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = ic_rem % " + std::to_string(kWidth) + "u;\n\n"; + } else if (fDim > 1) { + op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const kh = ic_rem / " + std::to_string(kWidth) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = ic_rem % " + std::to_string(kWidth) + "u;\n\n"; + } else { + op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const kh = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = ic_rem;\n\n"; + } + + // Compute destination index in _f (dilated layout) + op += SP + SP + SP + SP + "std::size_t const f_idx =\n"; + op += SP + SP + SP + SP + SP + "oc * " + std::to_string(ocstrideDil) + "u +\n"; + op += SP + SP + SP + SP + SP + "ic * " + std::to_string(icstrideDil) + "u"; + if (fDim > 2) op += " +\n" + SP + SP + SP + SP + SP + "kd * " + std::to_string(dstrideDil) + "u"; + if (fDim > 1) op += " +\n" + SP + SP + SP + SP + SP + "kh * " + std::to_string(hstrideDil) + "u"; + op += " +\n" + SP + SP + SP + SP + SP + "kw * " + std::to_string(wstrideDil) + "u;\n\n"; + + op += SP + SP + SP + SP + "f[f_idx] = W[elem_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n\n"; + + // Kernel 2: Im2Col + std::string im2colKname = "Im2ColKernel_" + opName; + op += SP + "//------ IM2COL_KERNEL_ALPAKA (Conv " + opName + ")\n"; + op += SP + "struct " + im2colKname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* 
__restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ col,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + op += SP + SP + SP + SP + "std::size_t const col_row = elem_idx / " + std::to_string(colCols) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const col_col = elem_idx % " + std::to_string(colCols) + "u;\n\n"; + + op += SP + SP + SP + SP + "std::size_t const ic = col_row / " + std::to_string(kernelSize) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const k_rem = col_row % " + std::to_string(kernelSize) + "u;\n"; + if (fDim > 2) { + op += SP + SP + SP + SP + "std::size_t const kd = k_rem / " + std::to_string(kHeight * kWidth) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kh = (k_rem / " + std::to_string(kWidth) + "u) % " + std::to_string(kHeight) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = k_rem % " + std::to_string(kWidth) + "u;\n\n"; + } else if (fDim > 1) { + op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const kh = k_rem / " + std::to_string(kWidth) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = k_rem % " + std::to_string(kWidth) + "u;\n\n"; + } else { + op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const kh = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = k_rem;\n\n"; + } + + if (fDim > 2) { + op += SP + SP + SP + SP + "std::size_t const od = col_col / " + std::to_string(oHeight * oWidth) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const oh = (col_col / " + 
std::to_string(oWidth) + "u) % " + std::to_string(oHeight) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const ow = col_col % " + std::to_string(oWidth) + "u;\n\n"; + } else if (fDim > 1) { + op += SP + SP + SP + SP + "std::size_t const od = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const oh = col_col / " + std::to_string(oWidth) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const ow = col_col % " + std::to_string(oWidth) + "u;\n\n"; + } else { + op += SP + SP + SP + SP + "std::size_t const od = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const oh = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const ow = col_col;\n\n"; + } + + // Depth: trivially 0 for fDim < 3 (od=kd=0 always); pads[0] is height-begin for 2D, so + // applying it here would make id_in negative and zero the whole output. + if (fDim >= 3) { + op += SP + SP + SP + SP + "int64_t const id_in = static_cast(od * " + std::to_string(fAttrStrides[0]) + + "u + kd * " + std::to_string(fAttrDilations[0]) + "u) - " + std::to_string(fAttrPads[0]) + ";\n"; + } else { + op += SP + SP + SP + SP + "int64_t const id_in = 0;\n"; + } + // Height: for fDim==3 the height dim is at strides/pads index 1; for fDim==2 it is at index 0. + // For fDim==1 oh=kh=0 so ih_in=0. + { + size_t const hIdx = (fDim > 2) ? 1 : 0; + if (fDim >= 2) { + op += SP + SP + SP + SP + "int64_t const ih_in = static_cast(oh * " + std::to_string(fAttrStrides[hIdx]) + + "u + kh * " + std::to_string(fAttrDilations[hIdx]) + "u) - " + std::to_string(fAttrPads[hIdx]) + ";\n"; + } else { + op += SP + SP + SP + SP + "int64_t const ih_in = 0;\n"; + } + } + // Width: fAttrStrides/Dilations/Pads are ordered [d,h,w] so width is at index fDim-1. 
+ { + size_t const wIdx = fDim - 1; + op += SP + SP + SP + SP + "int64_t const iw_in = static_cast(ow * " + std::to_string(fAttrStrides[wIdx]) + + "u + kw * " + std::to_string(fAttrDilations[wIdx]) + "u) - " + std::to_string(fAttrPads[wIdx]) + ";\n\n"; + } + + op += SP + SP + SP + SP + "bool const in_bounds =\n"; + op += SP + SP + SP + SP + SP + "id_in >= 0 && id_in < " + std::to_string(iDepth) + " &&\n"; + op += SP + SP + SP + SP + SP + "ih_in >= 0 && ih_in < " + std::to_string(iHeight) + " &&\n"; + op += SP + SP + SP + SP + SP + "iw_in >= 0 && iw_in < " + std::to_string(iWidth) + ";\n\n"; + + op += SP + SP + SP + SP + "if (in_bounds) {\n"; + op += SP + SP + SP + SP + SP + "std::size_t const in_idx =\n"; + op += SP + SP + SP + SP + SP + SP + "ic * " + std::to_string(iDepth * iHeight * iWidth) + "u +\n"; + op += SP + SP + SP + SP + SP + SP + "static_cast(id_in) * " + std::to_string(iHeight * iWidth) + "u +\n"; + op += SP + SP + SP + SP + SP + SP + "static_cast(ih_in) * " + std::to_string(iWidth) + "u +\n"; + op += SP + SP + SP + SP + SP + SP + "static_cast(iw_in);\n"; + op += SP + SP + SP + SP + SP + "col[elem_idx] = input[in_idx];\n"; + op += SP + SP + SP + SP + "} else {\n"; + op += SP + SP + SP + SP + SP + "col[elem_idx] = static_cast(0);\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n\n"; + + // Kernel 3: Bias broadcast (only if bias present) + if (!fNB.empty()) { + std::string biasKname = "BiasBroadcastKernel_" + opName; + op += SP + "//------ BIAS_BROADCAST_KERNEL_ALPAKA (Conv " + opName + ")\n"; + op += SP + "struct " + biasKname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ bias,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const 
global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n"; + op += SP + SP + SP + SP + "std::size_t const channel = elem_idx / " + std::to_string(spatialSize) + "u;\n"; + op += SP + SP + SP + SP + "output[elem_idx] = bias[channel];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n\n"; + } + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string op; + op = SP + "WeightVecKernel_" + opName + " weightVecKernel_" + opName + ";\n"; + op += SP + "Im2ColKernel_" + opName + " im2colKernel_" + opName + ";\n"; + if (!fNB.empty()) + op += SP + "BiasBroadcastKernel_" + opName + " biasBroadcastKernel_" + opName + ";\n"; + return op; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty() || fShapeW.empty() || fShapeY.empty()) + throw std::runtime_error("SOFIE Conv Op called to Generate without being initialized first"); + + size_t bsize = fShapeX[0].dim; + size_t oDepth = (fDim > 2) ? fShapeY[2].dim : 1; + size_t oHeight = (fDim > 1) ? fShapeY[fDim].dim : 1; + size_t oWidth = fShapeY[fDim + 1].dim; + size_t iDepth = (fDim > 2) ? fShapeX[2].dim : 1; + size_t iHeight = (fDim > 1) ? 
fShapeX[fDim].dim : 1; + size_t iWidth = fShapeX[fDim + 1].dim; + size_t outChannels = fShapeW[0]; + size_t kernelSize = fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2]; + // gemm dimensions computed from shape members + size_t gemm_n = outChannels; // output channels + size_t gemm_k = fShapeW[1] * kernelSize; // input channels/group * kernel volume + size_t gemm_m = oDepth * oHeight * oWidth; // output spatial size per channel + size_t colElements = gemm_k * gemm_m; // colRows * colCols + size_t wTotal = ConvertShapeToLength(fShapeW); + + // For group conv: per-group output channels and _f offset + // gemm_n stays as total output channels — we divide per group at launch + size_t groupFOffset = gemm_n * gemm_k; // elements of _f per group + + std::stringstream out; + out << "\n//------ CONV_GPU_ALPAKA\n"; + + // ----------------------------------------------------------------------- + // Step 1: Weight vectorisation kernel — runs once, fully on GPU + // ----------------------------------------------------------------------- + out << SP << "// Step 1: vectorise W -> _f on GPU (once per infer call)\n"; + out << SP << "{\n"; + out << SP << SP << "auto const elementsPerThread_wv = Vec::all(static_cast(1));\n"; + out << SP << SP << "auto const elementsPerGrid_wv = Vec::all(Idx{" << wTotal << "});\n"; + out << SP << SP << "auto const workDiv_wv = sofie_workdiv(elementsPerGrid_wv);\n"; + out << SP << SP << "alpaka::exec(queue, workDiv_wv, weightVecKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNW << ")" + << ", alpaka::getPtrNative(deviceBuf_" << convK << ")" + << ", static_cast(" << wTotal << "));\n"; + out << SP << SP << "alpaka::wait(queue);\n"; + out << SP << "}\n\n"; + + // ----------------------------------------------------------------------- + // Step 2: Batch loop + // ----------------------------------------------------------------------- + out << SP << "for (std::size_t n = 0; n < " << bsize << "; n++) {\n\n"; + out << SP << SP 
<< "std::size_t const x_offset = n * " + << fShapeX[1].dim * iDepth * iHeight * iWidth << "u;\n"; + out << SP << SP << "std::size_t const out_offset = n * " + << fShapeY[1].dim * gemm_m << "u;\n\n"; + + // ----------------------------------------------------------------------- + // Step 3 + 4: Im2Col then GEMM — structure differs for grouped vs non-grouped + // ----------------------------------------------------------------------- + if (fAttrGroup == 1) { + // Non-grouped: single im2col per batch, then GEMM + out << SP << SP << "// Step 3: im2col\n"; + out << SP << SP << "{\n"; + out << SP << SP << SP << "auto const elementsPerThread_im2col = Vec::all(static_cast(1));\n"; + out << SP << SP << SP << "auto const elementsPerGrid_im2col = Vec::all(Idx{" << colElements << "});\n"; + out << SP << SP << SP << "auto const workDiv_im2col = sofie_workdiv(elementsPerGrid_im2col);\n"; + out << SP << SP << SP << "alpaka::exec(queue, workDiv_im2col, im2colKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ") + x_offset" + << ", alpaka::getPtrNative(deviceBuf_" << imcol << ")" + << ", static_cast(" << colElements << "));\n"; + out << SP << SP << SP << "alpaka::wait(queue);\n"; + out << SP << SP << "}\n\n"; + + if (!fNB.empty()) { + size_t biasElements = gemm_n * gemm_m; + out << SP << SP << "// Step 4a: broadcast bias into output slice\n"; + out << SP << SP << "{\n"; + out << SP << SP << SP << "auto const elementsPerThread_bias = Vec::all(static_cast(1));\n"; + out << SP << SP << SP << "auto const elementsPerGrid_bias = Vec::all(Idx{" << biasElements << "});\n"; + out << SP << SP << SP << "auto const workDiv_bias = sofie_workdiv(elementsPerGrid_bias);\n"; + out << SP << SP << SP << "alpaka::exec(queue, workDiv_bias, biasBroadcastKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNB << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ") + out_offset" + << ", static_cast(" << biasElements << "));\n"; + out << SP << SP << SP << 
"alpaka::wait(queue);\n"; + out << SP << SP << "}\n\n"; + out << SP << SP << "// Step 4b: GEMM beta=1 accumulates onto bias-initialised output\n"; + out << SP << SP << "blas.matmul('n', 'n', " + << gemm_m << ", " << gemm_n << ", " << gemm_k + << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")" + << ", alpaka::getPtrNative(deviceBuf_" << convK << ")" + << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + out_offset);\n\n"; + } else { + out << SP << SP << "// Step 4: GEMM beta=0 (no bias)\n"; + out << SP << SP << "blas.matmul('n', 'n', " + << gemm_m << ", " << gemm_n << ", " << gemm_k + << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")" + << ", alpaka::getPtrNative(deviceBuf_" << convK << ")" + << ", 0.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + out_offset);\n\n"; + } + // Wait for GEMM to finish before next batch overwrites the shared _xcol buffer. + out << SP << SP << "alpaka::wait(queue);\n\n"; + + } else { + // Grouped convolution: im2col and GEMM per group with group-adjusted input pointer. + // Each group processes fShapeW[1] input channels starting at g * fShapeW[1]. 
+         // Per-group extents. gemm_n (= fShapeW[0]) counts ALL output channels, so one group
+         // produces gemm_n / fAttrGroup of them. Using the total here made every group's GEMM
+         // compute all channels and pushed the weight/output/bias offsets of groups g >= 1
+         // past the data that belongs to this convolution.
+         // NOTE(review): assumes the vectorised weights keep the groups back to back along
+         // W's leading (output-channel) axis - confirm against WeightVecKernel.
+         size_t const groupOutChannels = gemm_n / fAttrGroup;
+         size_t const groupWeightElements = groupFOffset / fAttrGroup; // weights of one group in _f
+         size_t const groupOutElements = groupOutChannels * gemm_m;    // output slice of one group
+
+         out << SP << SP << "for (std::size_t g = 0; g < " << fAttrGroup << "; g++) {\n\n";
+         out << SP << SP << SP << "std::size_t const g_in_offset = x_offset + g * "
+             << fShapeW[1] * iDepth * iHeight * iWidth << "u;\n";
+         out << SP << SP << SP << "std::size_t const g_out_offset = out_offset + g * "
+             << groupOutElements << "u;\n";
+         out << SP << SP << SP << "std::size_t const f_offset = g * " << groupWeightElements << "u;\n\n";
+
+         out << SP << SP << SP << "// im2col for group g (reads only this group's input channels)\n";
+         out << SP << SP << SP << "{\n";
+         out << SP << SP << SP << SP << "auto const elementsPerThread_im2col = Vec::all(static_cast(1));\n";
+         out << SP << SP << SP << SP << "auto const elementsPerGrid_im2col = Vec::all(Idx{" << colElements << "});\n";
+         out << SP << SP << SP << SP << "auto const workDiv_im2col = sofie_workdiv(elementsPerGrid_im2col);\n";
+         out << SP << SP << SP << SP << "alpaka::exec(queue, workDiv_im2col, im2colKernel_" << opName
+             << ", alpaka::getPtrNative(deviceBuf_" << fNX << ") + g_in_offset"
+             << ", alpaka::getPtrNative(deviceBuf_" << imcol << ")"
+             << ", static_cast(" << colElements << "));\n";
+         out << SP << SP << SP << SP << "alpaka::wait(queue);\n";
+         out << SP << SP << SP << "}\n\n";
+
+         if (!fNB.empty()) {
+            // The bias kernel fills one group's output slice; the bias pointer is advanced
+            // to the first output channel owned by group g.
+            size_t groupBiasElements = groupOutElements;
+            out << SP << SP << SP << "// Broadcast group bias\n";
+            out << SP << SP << SP << "{\n";
+            out << SP << SP << SP << SP << "auto const elementsPerThread_bias = Vec::all(static_cast(1));\n";
+            out << SP << SP << SP << SP << "auto const elementsPerGrid_bias = Vec::all(Idx{" << groupBiasElements << "});\n";
+            out << SP << SP << SP << SP << "auto const workDiv_bias = sofie_workdiv(elementsPerGrid_bias);\n";
+            out << SP << SP << SP << SP << "alpaka::exec(queue, workDiv_bias, biasBroadcastKernel_" << opName
+                << ", alpaka::getPtrNative(deviceBuf_" << fNB << ") + g * " << groupOutChannels
+                << ", alpaka::getPtrNative(deviceBuf_" << fNY << ") + g_out_offset"
+                << ", static_cast(" << groupBiasElements << "));\n";
+            out << SP << SP << SP << SP << "alpaka::wait(queue);\n";
+            out << SP << SP << SP << "}\n\n";
+            // beta = 1: the GEMM accumulates onto the bias-initialised output slice.
+            out << SP << SP << SP << "blas.matmul('n', 'n', "
+                << gemm_m << ", " << groupOutChannels << ", " << gemm_k
+                << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")"
+                << ", alpaka::getPtrNative(deviceBuf_" << convK << ") + f_offset"
+                << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + g_out_offset);\n\n";
+         } else {
+            // beta = 0: no bias, the GEMM overwrites the output slice.
+            out << SP << SP << SP << "blas.matmul('n', 'n', "
+                << gemm_m << ", " << groupOutChannels << ", " << gemm_k
+                << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")"
+                << ", alpaka::getPtrNative(deviceBuf_" << convK << ") + f_offset"
+                << ", 0.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + g_out_offset);\n\n";
+         }
+         // Wait for GEMM to finish before next group's im2col overwrites the shared _xcol buffer.
+         out << SP << SP << SP << "alpaka::wait(queue);\n\n";
+         out << SP << SP << "}\n"; // end group loop
+      }
+
+      out << SP << "}\n"; // end batch loop
+      return out.str();
+   }
+
+   /*! \brief Returns the blas routines needed to compile the generated code
+    */
+   std::vector GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; }
+
+
+   // NOTE(review): this reports fShapeW[0] (all output channels) as the GEMM n extent; for
+   // grouped convolutions the emitted GEMMs use fShapeW[0] / fAttrGroup - confirm what the
+   // consumer of this configuration expects when fAttrGroup > 1.
+   std::string GetBlasConfig(){
+      size_t oDepth_ = (fDim > 2) ? fShapeY[2].dim : 1;
+      size_t oHeight_ = (fDim > 1) ?
fShapeY[fDim].dim : 1; + size_t oWidth_ = fShapeY[fDim + 1].dim; + size_t kSize_ = fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2]; + size_t gemm_n_ = fShapeW[0]; + size_t gemm_k_ = fShapeW[1] * kSize_; + size_t gemm_m_ = oDepth_ * oHeight_ * oWidth_; + auto lda = std::to_string(gemm_m_); // ld for xcol^T (gemm_m×gemm_k col-major) + auto ldb = std::to_string(gemm_k_); // ld for xf^T (gemm_k×gemm_n col-major) + auto ldc = std::to_string(gemm_m_); // ld for y^T (gemm_m×gemm_n col-major) + return std::to_string(gemm_m_) + ", " + std::to_string(gemm_n_) + ", " + std::to_string(gemm_k_) + ", " + lda + ", " + ldb + ", " + ldc + ", 'n', 'n'"; + } + +}; + +} // namespace SOFIE + +#endif diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx b/core/inc/SOFIE/ROperator_ConvTranspose.hxx similarity index 95% rename from src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx rename to core/inc/SOFIE/ROperator_ConvTranspose.hxx index 0467385..5a4acf3 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx +++ b/core/inc/SOFIE/ROperator_ConvTranspose.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX #define SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX -#include -#include -#include +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" #include #include @@ -88,7 +88,7 @@ public: if (std::is_same::value) { fType = "float"; } else { - throw std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator"); + throw std::runtime_error("SOFIE Encountered unsupported type parsing a Conv operator"); } } diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.icc b/core/inc/SOFIE/ROperator_ConvTranspose.icc similarity index 93% rename from src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.icc rename to core/inc/SOFIE/ROperator_ConvTranspose.icc index 3a52796..52b6b3e 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.icc +++ 
b/core/inc/SOFIE/ROperator_ConvTranspose.icc @@ -105,22 +105,22 @@ void ROperator_ConvTranspose::Initialize(RModel& model){ fUseSession = model.UseSession(); if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA SOFIE Conv Transpose op Input Tensor " + fNX + " is not found in model"); + throw std::runtime_error("SOFIE Conv Transpose op Input Tensor " + fNX + " is not found in model"); } fShapeX = model.GetTensorShape(fNX); if (fShapeX.size() < 3 || fShapeX.size() > 5) { std::cout << fNX << " : " << ConvertShapeToString(fShapeX) << std::endl; - throw std::runtime_error("TMVA SOFIE Conv Transpose Op input data tensor" + fNX + + throw std::runtime_error("SOFIE Conv Transpose Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions"); } fDim = fShapeX.size() - 2; if (!model.CheckIfTensorAlreadyExist(fNW)) { - throw std::runtime_error("TMVA SOFIE Conv op Input weight Tensor " + fNW + " is not found in model"); + throw std::runtime_error("SOFIE Conv op Input weight Tensor " + fNW + " is not found in model"); } fShapeW = model.GetTensorShape(fNW); if (fShapeW.size() < 3 || fShapeW.size() > 5) { std::cout << fNW << " : " << ConvertShapeToString(fShapeW) << std::endl; - throw std::runtime_error("TMVA SOFIE Conv Transpose Op input weight tensor" + fNW + + throw std::runtime_error("SOFIE Conv Transpose Op input weight tensor" + fNW + " is not of 3,4 or 5 dimensions"); } fShapeY = ShapeInference({fShapeX, fShapeW})[0]; @@ -128,11 +128,11 @@ void ROperator_ConvTranspose::Initialize(RModel& model){ model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); if (fNB != "") { if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw std::runtime_error("TMVA SOFIE ConvTrans op Input Tensor " + fNB + " is not found in model"); + throw std::runtime_error("SOFIE ConvTrans op Input Tensor " + fNB + " is not found in model"); } fShapeB = model.GetTensorShape(fNB); if (fShapeB.size() < 1) - throw std::runtime_error("TMVA SOFIE ConvTrans op: Bias Tensor 
has empty shape"); + throw std::runtime_error("SOFIE ConvTrans op: Bias Tensor has empty shape"); size_t bsize = ConvertShapeToLength(fShapeB); size_t ysize = ConvertShapeToLength(fShapeY); @@ -143,13 +143,13 @@ void ROperator_ConvTranspose::Initialize(RModel& model){ // we assume bias tensor size is equal to number of filters that is the second dimension in // the output tensor if (bsize != fShapeY[1] ) - throw std::runtime_error("TMVA SOFIE ConvTrans op: Bias Tensor has wrong shape: " + + throw std::runtime_error("SOFIE ConvTrans op: Bias Tensor has wrong shape: " + ConvertShapeToString(fShapeB)); auto original_data = model.GetInitializedTensorData(fNB); if (fType != "float") - throw std::runtime_error("TMVA SOFIE ConvTrans op: Broadcasting for non-float type tensors is not supported"); + throw std::runtime_error("SOFIE ConvTrans op: Broadcasting for non-float type tensors is not supported"); // here the acual broadcasting if (!fUseSession) { // Broadcast B from M to N x M x Od x Oh x Ow @@ -170,7 +170,7 @@ void ROperator_ConvTranspose::Initialize(RModel& model){ else { // bias tensor is already correct shape, no need to broadcast if (fShapeY != fShapeB) - throw std::runtime_error("TMVA SOFIE ConvTrans op: Broadcasting is not needed but bias has wrong shape" + + throw std::runtime_error("SOFIE ConvTrans op: Broadcasting is not needed but bias has wrong shape" + ConvertShapeToString(fShapeB)); fNBroadcastedB = fNB; } @@ -218,7 +218,7 @@ std::string ROperator_ConvTranspose::Generate(std::string OpName) OpName = "op_" + OpName; if (fShapeX.empty() || fShapeW.empty() || (fNB != "" && fShapeB.empty()) || fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Conv Op called to Generate without being initialized first"); } std::stringstream out; @@ -331,7 +331,7 @@ std::string ROperator_ConvTranspose::Generate(std::string OpName) // Resulting matrix op_xcol is (output 
channels * filter_h * filter_w , output_h * output_w) if (fDim == 1) { if (fAttrPads[0] != fAttrPads[1]) { - std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " + std::cout << "SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " << std::endl; fAttrPads[0] = (fAttrPads[0] + fAttrPads[1]) / 2; } @@ -339,7 +339,7 @@ std::string ROperator_ConvTranspose::Generate(std::string OpName) } if (fDim == 2) { if (fAttrPads[0] != fAttrPads[2] || fAttrPads[1] != fAttrPads[3]) { - std::cout << "TMVA SOFIE Operator ConvTranspose: asymmetric padding not supported. Assume an average padding " + std::cout << "SOFIE Operator ConvTranspose: asymmetric padding not supported. Assume an average padding " << std::endl; fAttrPads[0] = (fAttrPads[0] + fAttrPads[2]) / 2; fAttrPads[1] = (fAttrPads[1] + fAttrPads[3]) / 2; @@ -347,7 +347,7 @@ std::string ROperator_ConvTranspose::Generate(std::string OpName) } if (fDim == 3) { if (fAttrPads[0] != fAttrPads[3] || fAttrPads[1] != fAttrPads[4] || fAttrPads[2] != fAttrPads[5]) { - std::cout << "TMVA SOFIE Operator ConvTranspose: asymmetric padding not supported. Assume an average padding " + std::cout << "SOFIE Operator ConvTranspose: asymmetric padding not supported. 
Assume an average padding " << std::endl; fAttrPads[0] = (fAttrPads[0] + fAttrPads[3]) / 2; fAttrPads[1] = (fAttrPads[1] + fAttrPads[4]) / 2; @@ -385,7 +385,7 @@ std::string ROperator_ConvTranspose::Generate(std::string OpName) out << ", tensor_" << fNY << " + out_offset);\n\n "; } else { // 3d : needs a col2im for 3d - throw std::runtime_error("TMVA SOFIE 3D Conv Transpose not yet supported"); + throw std::runtime_error("SOFIE 3D Conv Transpose not yet supported"); out << SP << SP << "SOFIE::UTILITY::Im2col_3d(tensor_" << fNX << " + x_offset," // channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, @@ -436,7 +436,7 @@ std::string ROperator_ConvTranspose::Generate(std::string OpName) out << ", tensor_" << fNY << " + out_offset);\n\n "; } else { // 3d im2col - throw std::runtime_error("TMVA SOFIE 3D Conv Transpose not yet supported"); + throw std::runtime_error("SOFIE 3D Conv Transpose not yet supported"); out << SP << SP << "SOFIE::UTILITY::Im2col_3d(tensor_" << fNX << " + x_offset," diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Custom.hxx b/core/inc/SOFIE/ROperator_Custom.hxx similarity index 92% rename from src/SOFIE_core/inc/SOFIE/ROperator_Custom.hxx rename to core/inc/SOFIE/ROperator_Custom.hxx index c24d329..fb618d4 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Custom.hxx +++ b/core/inc/SOFIE/ROperator_Custom.hxx @@ -48,13 +48,13 @@ public: for(auto& it:fInputNames){ if (model.CheckIfTensorAlreadyExist(it) == false){ - throw std::runtime_error("TMVA SOFIE Custom " + fOpName + " Op Input Tensor " + it + " is not found in model"); + throw std::runtime_error("SOFIE Custom " + fOpName + " Op Input Tensor " + it + " is not found in model"); } fInputSizes.push_back(ConvertShapeToLength(model.GetTensorShape(it))); } if(fOutputNames.size() != fOutputShapes.size()){ - throw std::runtime_error("TMVA SOFIE Custom "+ fOpName + " Op was not intialized with the names/shapes of all the output tensors"); + throw std::runtime_error("SOFIE 
Custom "+ fOpName + " Op was not intialized with the names/shapes of all the output tensors"); } for(long unsigned int i=0; i & namesX, const std::string & nameY): fNInputs(namesX.size()), fNY(UTILITY::Clean_name(nameY)) { + fKind = OperatorKind::EINSUM; for (size_t i = 0; i < namesX.size(); i++) fNInputs[i] = UTILITY::Clean_name(namesX[i]); // parse teh equations to find labels if (!ParseEquation(equation)) - throw std::runtime_error("TMVA SOFIE Einsum Op: Error parsing the equation " + equation); + throw std::runtime_error("SOFIE Einsum Op: Error parsing the equation " + equation); fInputTensorNames.resize(fNInputs.size()); std::transform(fNInputs.begin(), fNInputs.end(), fInputTensorNames.begin(), @@ -128,7 +129,7 @@ public: std::map labelsMap; for ( auto & name : fNInputs) { if (!model.CheckIfTensorAlreadyExist(name)) - throw std::runtime_error(std::string("TMVA SOFIE Einsum Op Input Tensor ") + name + "is not found in model"); + throw std::runtime_error(std::string("SOFIE Einsum Op Input Tensor ") + name + "is not found in model"); // if (model.IsDynamicTensor(name) || model.IsDimInputTensor(name) ) { // // not yet supported @@ -140,7 +141,7 @@ public: std::string labels = fInputLabels[i]; for (size_t j = 0; j < shape.size(); j++) { if (j >= labels.length()) { - throw std::runtime_error(std::string("TMVA SOFIE Einsum Op Input Tensor has invalid label or shape ") + labels + " " + ConvertShapeToString(shape)); + throw std::runtime_error(std::string("SOFIE Einsum Op Input Tensor has invalid label or shape ") + labels + " " + ConvertShapeToString(shape)); } labelsMap[labels[j]] = shape[j]; } @@ -149,7 +150,7 @@ public: // get output shape from label maps for (char l : fOutputLabels) { if (labelsMap.count(l) == 0) - throw std::runtime_error(std::string("TMVA SOFIE Einsum Op : output label ") + std::string(&l) + " is not present in inputs"); + throw std::runtime_error(std::string("SOFIE Einsum Op : output label ") + std::string(1, l) + " is not present in inputs");
fShapeY.push_back(labelsMap[l]); } // we need to get the labels we are going to sum @@ -209,7 +210,7 @@ public: opName = "op_" + opName; if (fShapeY.size() != fOutputLabels.length()) { - throw std::runtime_error("TMVA SOFIE Einsum Op called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Einsum Op called to Generate without being initialized first"); } // function to write compute expression index from strides diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx b/core/inc/SOFIE/ROperator_Elu.hxx similarity index 86% rename from src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx rename to core/inc/SOFIE/ROperator_Elu.hxx index 34e18a6..43b1886 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx +++ b/core/inc/SOFIE/ROperator_Elu.hxx @@ -27,6 +27,7 @@ public: ROperator_Elu(float alpha,std::string nameX, std::string nameY): falpha(alpha),fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) { + fKind = OperatorKind::ELU; fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; @@ -34,7 +35,7 @@ public: fType = "float"; } else{ - throw std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Elu operator"); + throw std::runtime_error("SOFIE Encountered unsupported type parsing a Elu operator"); } } @@ -49,7 +50,7 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Elu Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE Elu Op Input Tensor is not found in model"); } fShape = model.GetTensorShape(fNX); model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); @@ -59,7 +60,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Operator Elu called to Generate without being initialized first"); + throw 
std::runtime_error("SOFIE Operator Elu called to Generate without being initialized first"); } std::stringstream out; size_t length = ConvertShapeToLength(fShape); diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Erf.hxx b/core/inc/SOFIE/ROperator_Erf.hxx similarity index 100% rename from src/SOFIE_core/inc/SOFIE/ROperator_Erf.hxx rename to core/inc/SOFIE/ROperator_Erf.hxx diff --git a/core/inc/SOFIE/ROperator_Expand.hxx b/core/inc/SOFIE/ROperator_Expand.hxx new file mode 100644 index 0000000..95955ed --- /dev/null +++ b/core/inc/SOFIE/ROperator_Expand.hxx @@ -0,0 +1,347 @@ +#ifndef SOFIE_ROperator_Expand +#define SOFIE_ROperator_Expand + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + + +namespace SOFIE{ + +template +class ROperator_Expand final : public ROperator{ +private: + + std::vector fShapeX; + std::vector fShape; + std::vector fShapeY; + std::vector fShapeDim; + + std::string fNX; + std::string fNShape; + std::string fNY; + std::string fType; + + bool fInitialized = false; + bool fInitializedShape = false; + bool fInitBroadcast = false; + +public: + ROperator_Expand(){} + ROperator_Expand(std::string nameX, std::string nameShape, std::string nameY): + fNX(UTILITY::Clean_name(nameX)), fNShape(UTILITY::Clean_name(nameShape)), fNY(UTILITY::Clean_name(nameY)){ + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + + void Initialize(RModel& model) override { + // input must be a graph input, or already initialized intermediate tensor + if (!model.CheckIfTensorAlreadyExist(fNX)) { + throw std::runtime_error("SOFIE Expand Op Input Tensor " + fNX + " is not found in model"); + } + fShapeX = model.GetDimTensorShape(fNX); + if (model.IsInitializedTensor(fNShape)) { + fInitializedShape = true; + int64_t *shapeData = + static_cast(model.GetInitializedTensorData(fNShape).get()); + fShape = model.GetTensorShape(fNShape); + if (fShape.size() != 1) { + throw std::runtime_error("TMVA::SOFIE - 
Expand operator shape must be a 1d tensor."); + } + size_t N = fShape[0]; + // what do we do if shapeData contains negative values? + for (size_t i = 0; i < N; i++) { + if ( shapeData[i] < 0) + throw std::runtime_error("TMVA::SOFIE - Expand: invalid shape value " + std::to_string(shapeData[i])); + } + std::vector shape(shapeData, shapeData + N); + fShapeDim = ConvertShapeToDim(shape); + } else if (model.IsShapeTensor(fNShape)) { + // case input shape is a shape tensor + fShapeDim = model.GetShapeTensorValues(fNShape); + fInitializedShape = true; + } else { + // assume shape of input shape is known (size is 1) + auto shapeOfInputShape = model.GetTensorShape(fNShape); + fShapeDim.resize(shapeOfInputShape[0]); + for (size_t i = 0; i < fShapeDim.size(); i++) { + fShapeDim[i] = Dim{std::string("v_") + fNShape + "_" + std::to_string(i)}; + model.AddShapeParam(fShapeDim[i].param); + } + } + // Y is the common shape of fShapeX and shape + auto ret = SOFIE::UTILITY::MultidirectionalBroadcastShape(fShapeX, fShapeDim); + fShapeY = ret.second; + fInitialized = model.IsInitializedTensor(fNX) && fInitializedShape; + std::vector shapeX; + std::vector shapeY; + // case shape tensor and input shape are known + if (!model.IsDynamicTensor(fNX) && !model.IsDimInputTensor(fNX) && fInitializedShape) { + shapeX = ConvertShapeToInt(fShapeX); + shapeY = ConvertShapeToInt(fShapeY); + if (!UTILITY::AreSameShape(shapeX, shapeY)) + fInitBroadcast = true; + } + if (fInitialized) { + // cannot have Dim initialized tensors + assert(!shapeX.empty() && !shapeY.empty()); + // Broadcast X to the common shape shapeY + // If X is an initialized tensor (constant) + auto data = model.GetInitializedTensorData(fNX); + if (fInitBroadcast) { + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), shapeX, shapeY), + std::default_delete()); + // Update the data and the shape of X + model.UpdateInitializedTensor(fNX, model.GetTensorType(fNX), shapeY, broadcastedData); + 
fShapeX = fShapeY; + // need to set as a not writable tensor + model.SetNotWritableInitializedTensor(fNX); + data = broadcastedData; + } + if (fInitBroadcast || model.IsConstantTensor(fNX)) { + fIsOutputConstant = true; // constant output in this case + model.AddConstantTensor(fNY, model.GetTensorType(fNX), shapeY, data); + fOutputTensorNames.pop_back(); + } else { + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), shapeY); + } + } else { + // // case input is not initialized + // if (shapeX.empty() && shapeDim.empty()) { + + // } + // if (fInitializedShape) + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + } + fType = ConvertTypeToString(model.GetTensorType(fNX)); + if (model.Verbose()) { + std::cout << "Expand - input " << fNX << " shape " << ConvertDimShapeToString(fShapeX) << " --> " << fNY << " shape " + << ConvertDimShapeToString(fShapeY) << (fIsOutputConstant ? ConvertValuesToString(model.GetTensorData(fNY)) + " (constant)" : "") << std::endl; + } + + if (fInitializedShape && model.IsInitializedTensor(fNShape)) { + // Shape values are fully consumed into fShapeY/fShapeDim at generation time — + // no device buffer needed for fNShape for Heterogeneous inference + model.SetNotWritableInitializedTensor(fNShape); + } + } + + std::string GenerateInitCode() override { + std::stringstream out; + if (!fIsOutputConstant && fInitialized && !fInitBroadcast) { + // shapeX and shapeY are the same in this case + auto length = ConvertDimShapeToLength(fShapeY); + out << "// Copying initialized tensor " << fNX << " to " << fNY << "\n"; + out << SP << "std::copy(tensor_" << fNX << ", " << "tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n"; + } + return out.str(); + } + + std::string Generate(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) { + throw std::runtime_error("SOFIE Expand Op called to Generate without being initialized first"); + } + 
std::stringstream out; + out << SP << "\n//------ Expand " << opName << " --> " << ConvertDimShapeToString(fShapeY) << "\n"; + // need to declare shape parameters for non initialized shapes + if (!fInitializedShape) { + for (size_t i = 0; i < fShapeDim.size(); i++) { + out << SP << "size_t " << fShapeDim[i] << " = " << "tensor_" << fNShape << "[" << i << "];\n"; + } + } + // No need to broadcast A if it's an initialized tensor or shapes are the same + if (!fInitialized && fShapeX != fShapeY) { + out << SP << "// Broadcasting uninitialized tensor " << fNX << "\n"; + out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNX << ", " << ConvertDimShapeToString(fShapeX) << ", " << ConvertDimShapeToString(fShapeY) + << ", tensor_"<& shape) { + return std::all_of(shape.begin(), shape.end(), + [](const Dim& d){ return !d.isParam; }); + }; + if (!isStatic(fShapeX) || !isStatic(fShapeY)) return ""; + + // Check if broadcast is actually needed + bool needsBroadcast = (fShapeX.size() != fShapeY.size()); + if (!needsBroadcast) { + needsBroadcast = std::any_of(fShapeX.begin(), fShapeX.end(), + [&](const Dim& d) { + size_t i = &d - fShapeX.data(); + return fShapeX[i].dim != fShapeY[i].dim; + }); + } + if (!needsBroadcast) return ""; // same static shape — just a memcpy + + const std::size_t D = fShapeY.size(); + + // Left-pad fShapeX with dim=1 entries to match rank of fShapeY + std::vector shapeX_padded(D, 1); + size_t offset = D - fShapeX.size(); + for (size_t i = 0; i < fShapeX.size(); ++i) + shapeX_padded[offset + i] = fShapeX[i].dim; + + std::vector shapeY_int(D); + for (size_t i = 0; i < D; ++i) + shapeY_int[i] = fShapeY[i].dim; + + auto stridesX = UTILITY::ComputeStrideFromShape(shapeX_padded); + auto stridesY = UTILITY::ComputeStrideFromShape(shapeY_int); + std::size_t totalElements = ConvertShapeToLength(shapeY_int); + + std::string kname = "ExpandKernel_" + opName; + + std::string op; + op = "\n//------ EXPAND_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname 
+ " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + // Decompose output linear index using compile-time output strides + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(stridesY[d]) + "u) % " + + std::to_string(shapeY_int[d]) + "u;\n"; + } + op += "\n"; + + // Input index: broadcast dims (shapeX_padded[d]==1) contribute 0 — + // compiler eliminates zero terms entirely, no runtime branch + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeX_padded[d] == 1) { + op += SP + SP + SP + SP + SP + "0u"; + } else { + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesX[d]) + "u"; + } + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; // end grid-stride loop + op += SP + SP + "}\n"; // end operator() + op += SP + "};\n"; // end struct + + return op; +} + +std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + if (fInitialized) return ""; + + auto isStatic = [](const std::vector& shape) { + return std::all_of(shape.begin(), shape.end(), + [](const Dim& d){ return !d.isParam; }); + }; + if (!isStatic(fShapeX) || !isStatic(fShapeY)) return ""; + + // Check if broadcast is actually needed + bool needsBroadcast = (fShapeX.size() != fShapeY.size()); + if (!needsBroadcast) { + for (size_t i = 0; i < fShapeX.size(); ++i) + if (fShapeX[i].dim != fShapeY[i].dim) { needsBroadcast = true; break; } + } + if (!needsBroadcast) return ""; + + opName = "op_" + opName; + std::string kname = "ExpandKernel_" + opName; + return SP + kname + " expandKernel_" + opName + ";\n"; +} + +std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE Operator Expand called to Generate without being initialized first"); + + std::stringstream out; + out << "\n//------ EXPAND_GPU_ALPAKA\n"; + + if (fInitialized && !fInitBroadcast) { + // GenerateInitCode already handled the copy — nothing to do at inference time + return ""; + } + + auto isStatic = [](const std::vector& shape) { + return std::all_of(shape.begin(), shape.end(), + [](const Dim& d){ return !d.isParam; }); + }; + bool staticShapes = isStatic(fShapeX) && isStatic(fShapeY); + + // Check if broadcast is actually needed for static shapes + bool needsBroadcast = !staticShapes; // dynamic always needs runtime broadcast + if (staticShapes) { + needsBroadcast = (fShapeX.size() != fShapeY.size()); + if (!needsBroadcast) { + for (size_t i = 0; i < fShapeX.size(); ++i) + 
if (fShapeX[i].dim != fShapeY[i].dim) { needsBroadcast = true; break; } + } + } + + if (!needsBroadcast) { + // Same static shape — device-to-device copy + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY + << ", deviceBuf_" << fNX << ");\n"; + out << SP << "alpaka::wait(queue);\n"; + return out.str(); + } + + if (!staticShapes) { + // Dynamic shapes — not yet supported on GPU, throw a clear error + throw std::runtime_error( + "SOFIE Expand GPU: dynamic shapes are not yet supported for GPU inference. " + "Tensor " + fNX + " has a dynamic shape."); + } + + // Static broadcast — launch the expand kernel + std::vector shapeY_int(fShapeY.size()); + for (size_t i = 0; i < fShapeY.size(); ++i) + shapeY_int[i] = fShapeY[i].dim; + std::size_t totalElements = ConvertShapeToLength(shapeY_int); + std::string kname = "expandKernel_" + opName; + + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "auto task_" << opName << " = alpaka::createTaskKernel(workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::enqueue(queue, task_" << opName << ");\n"; + + return out.str(); +} +}; +}//SOFIE + +#endif //SOFIE_ROperator_Expand diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_EyeLike.hxx b/core/inc/SOFIE/ROperator_EyeLike.hxx similarity index 89% rename from src/SOFIE_core/inc/SOFIE/ROperator_EyeLike.hxx rename to core/inc/SOFIE/ROperator_EyeLike.hxx index 8e94e1c..91103ef 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_EyeLike.hxx +++ b/core/inc/SOFIE/ROperator_EyeLike.hxx @@ -40,11 +40,11 @@ public: void Initialize(RModel& model) override { if 
(model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE EyeLike Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE EyeLike Op Input Tensor is not found in model"); } fShape = model.GetTensorShape(fNX); if (fShape.size() != 2) - throw std::runtime_error("TMVA SOFIE EyeLike Op Input Tensor is not of rank 2"); + throw std::runtime_error("SOFIE EyeLike Op Input Tensor is not of rank 2"); if(fdtype){ ETensorType extractedType = static_cast(fdtype); @@ -59,7 +59,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShape.empty()){ - throw std::runtime_error("TMVA SOFIE Operator EyeLike called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator EyeLike called to Generate without being initialized first"); } auto length = ConvertShapeToLength(fShape); auto stride = SOFIE::UTILITY::ComputeStrideFromShape(fShape); diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx b/core/inc/SOFIE/ROperator_GRU.hxx similarity index 92% rename from src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx rename to core/inc/SOFIE/ROperator_GRU.hxx index bb1a74e..037e016 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx +++ b/core/inc/SOFIE/ROperator_GRU.hxx @@ -11,7 +11,6 @@ #include #include - namespace SOFIE { /*! 
\brief Gated Recurrent Unit operator @@ -91,7 +90,7 @@ template class ROperator_GRU final : public ROperator { fNSequence_lens(UTILITY::Clean_name(nameSequence_lens)), fNInitial_h(UTILITY::Clean_name(nameInitial_h)), fNY(UTILITY::Clean_name(nameY)), fNY_h(UTILITY::Clean_name(nameY_h)) { - + fInputTensorNames = { fNX, fNW, fNR }; if (!fNB.empty()){ fInputTensorNames.emplace_back(fNB); @@ -115,7 +114,7 @@ template class ROperator_GRU final : public ROperator { fType = "float"; } else { throw std::runtime_error( - "TMVA SOFIE Encountered unsupported type parsing a GRU operator"); + "SOFIE Encountered unsupported type parsing a GRU operator"); } } @@ -123,39 +122,34 @@ template class ROperator_GRU final : public ROperator { * * \param input type of the input tensors */ - std::vector TypeInference(std::vector /*input*/); + std::vector TypeInference(std::vector /*input*/) override; /*! \brief Infers the shape of the output tensors * * \param input shape of the input tensors */ - std::vector> ShapeInference(std::vector> /*input*/); + std::vector> ShapeInference(std::vector> /*input*/) override; /*! \brief Initialize the model * * \param model Model */ - void Initialize(RModel &); + void Initialize(RModel &) override; /*! \brief Generate the inference code * * \param OpName name of the operator */ - std::string Generate(std::string /*OpName*/); - - /*! \brief Generate the code for the Session internal data vectors - * - * \param opName name of the operator - */ - std::string GenerateSessionMembersCode(std::string opName); + std::string Generate(std::string /*OpName*/) override; /*! 
\brief Returns the blas routines needed to compile the generated code */ - std::vector GetBlasRoutines() { return { std::string("Gemm"), std::string("Axpy") }; } + std::vector GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; } }; } // namespace SOFIE + // Implementation of the ROperator_GRU class #include "SOFIE/ROperator_GRU.icc" diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc b/core/inc/SOFIE/ROperator_GRU.icc similarity index 93% rename from src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc rename to core/inc/SOFIE/ROperator_GRU.icc index f3813c2..f24460c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc +++ b/core/inc/SOFIE/ROperator_GRU.icc @@ -38,33 +38,33 @@ void ROperator_GRU::Initialize(RModel& model){ fUseSession = model.UseSession(); // Check the input and output tensors if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNX + " is not found in model."); + throw std::runtime_error("SOFIE GRU Op input tensor " + fNX + " is not found in model."); } fShapeX = model.GetTensorShape(fNX); if (fShapeX.size() != 3) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNX + " is not of 3 dimensions."); + throw std::runtime_error("SOFIE GRU Op input tensor " + fNX + " is not of 3 dimensions."); } if (!model.CheckIfTensorAlreadyExist(fNW)) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNW + " is not found in model."); + throw std::runtime_error("SOFIE GRU Op input tensor " + fNW + " is not found in model."); } fShapeW = model.GetTensorShape(fNW); if (fShapeW.size() != 3) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNW + " is not of 3 dimensions."); + throw std::runtime_error("SOFIE GRU Op input tensor " + fNW + " is not of 3 dimensions."); } if (!model.CheckIfTensorAlreadyExist(fNR)) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNR + " is not found in model."); + throw std::runtime_error("SOFIE GRU 
Op input tensor " + fNR + " is not found in model."); } fShapeR = model.GetTensorShape(fNR); if (fShapeR.size() != 3) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNR + " is not of 3 dimensions."); + throw std::runtime_error("SOFIE GRU Op input tensor " + fNR + " is not of 3 dimensions."); } if (!fNB.empty()) { if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw std::runtime_error("TMVA SOFIE GRU op input tensor " + fNB + " is not found in model."); + throw std::runtime_error("SOFIE GRU op input tensor " + fNB + " is not found in model."); } fShapeB = model.GetTensorShape(fNB); if (fShapeB.size() != 2 && fShapeB.size() != 4) { - throw std::runtime_error("TMVA SOFIE GRU op input tensor " + fNB + " is not of 2 or 4 dimensions."); + throw std::runtime_error("SOFIE GRU op input tensor " + fNB + " is not of 2 or 4 dimensions."); } if (fShapeB.size() == 2) { // Broadcasting the bias @@ -99,25 +99,25 @@ void ROperator_GRU::Initialize(RModel& model){ } if (!fNSequence_lens.empty()) { if (!model.CheckIfTensorAlreadyExist(fNSequence_lens)) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + + throw std::runtime_error("SOFIE GRU Op input tensor " + fNSequence_lens + "is not found in model."); } fShapeSequence_lens = model.GetTensorShape(fNSequence_lens); if (fShapeSequence_lens.size() != 1) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + + throw std::runtime_error("SOFIE GRU Op input tensor " + fNSequence_lens + " is not of 1 dimension."); } } if (!fNInitial_h.empty()) { if (!model.CheckIfTensorAlreadyExist(fNInitial_h)) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + + throw std::runtime_error("SOFIE GRU Op input tensor " + fNInitial_h + " is not found in model."); } fShapeInitial_h = model.GetTensorShape(fNInitial_h); if (fShapeInitial_h.size() != 3) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + + throw std::runtime_error("SOFIE GRU Op input tensor " + fNInitial_h + " is not of 3 
dimensions."); } } @@ -141,7 +141,7 @@ void ROperator_GRU::Initialize(RModel& model){ activation != "ScaledTanh" && activation != "HardSigmoid" && activation != "Elu" && activation != "Softsign" && activation != "Softplus") { - throw std::runtime_error("TMVA SOFIE - Activation function " + + throw std::runtime_error("SOFIE - Activation function " + activation + " not implemented"); } } @@ -150,22 +150,22 @@ void ROperator_GRU::Initialize(RModel& model){ fAttrDirection != "reverse" && fAttrDirection != "bidirectional") { throw std::runtime_error( - "TMVA SOFIE - Invalid GRU direction fAttrDirection = " + + "SOFIE - Invalid GRU direction fAttrDirection = " + fAttrDirection); } if (3 * fAttrHiddenSize != fShapeW[1]) { throw std::runtime_error( - "TMVA SOFIE - fAttrHiddenSize must be equal to " + + "SOFIE - fAttrHiddenSize must be equal to " + std::to_string(fShapeW[1] / 3)); } if (fAttrLayout > 1) { - throw std::runtime_error("TMVA SOFIE - Layout fAttrLayout = " + + throw std::runtime_error("SOFIE - Layout fAttrLayout = " + std::to_string(fAttrLayout) + " must be 0 (timewise) or 1 (batchwise)"); } if (fAttrLinearBeforeReset > 1) { throw std::runtime_error( - "TMVA SOFIE - fAttrInputForget = " + std::to_string(fAttrLinearBeforeReset) + "SOFIE - fAttrInputForget = " + std::to_string(fAttrLinearBeforeReset) + " must be 0 or 1."); } if (fAttrActivations.empty()) { @@ -175,51 +175,45 @@ void ROperator_GRU::Initialize(RModel& model){ fAttrActivations = {"Sigmoid", "Tanh"}; } } -} -// generate code for Session data members (e.g. internal vectors) -template -std::string ROperator_GRU::GenerateSessionMembersCode(std::string opName) -{ - opName = "op_" + opName; - std::stringstream out; + // To get unique intermediate tensor names, we add the name of the input + // tensor. One might also consider using the index of the operator in the + // RMode, but this information is not available in the current scope. 
+ std::string opName = "op_gru_" + fNX; size_t num_directions = fShapeW[0]; size_t seq_length = (fAttrLayout == 0) ? fShapeX[0] : fShapeX[1]; size_t batch_size = (fAttrLayout == 0) ? fShapeX[1] : fShapeX[0]; size_t input_size = fShapeX[2]; + auto declareVector = [&](std::string const &name, std::size_t n){ + std::string fullName = opName + "_" + name; + model.AddIntermediateTensor(fullName, ConvertStringToType(fType), std::vector{n}); + }; + if (fAttrLayout != 0) { - out << "std::vector<" << fType << "> fVec_" << opName << "_input = std::vector<" << fType << ">(" - << seq_length * batch_size * input_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_hidden_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_cell_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; + declareVector("input", seq_length * batch_size * input_size); + declareVector("initial_hidden_state", num_directions * batch_size * fAttrHiddenSize); + declareVector("initial_cell_state", num_directions * batch_size * fAttrHiddenSize); } // Set the feedforward size_t ff_size = seq_length * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_update_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_reset_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_hidden_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; + declareVector("f_update_gate", ff_size); + declareVector("f_reset_gate", ff_size); + declareVector("f_hidden_gate", ff_size); // gate results size_t hs_size = seq_length * num_directions * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_update_gate = std::vector<" << 
fType << ">(" << hs_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_reset_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; + declareVector("update_gate", hs_size); + declareVector("reset_gate", hs_size); + declareVector("hidden_gate", hs_size); // feedback - out << "std::vector<" << fType << "> fVec_" << opName << "_feedback = std::vector<" << fType << ">(" - << batch_size * fAttrHiddenSize << ");\n"; + declareVector("feedback", batch_size * fAttrHiddenSize); // hiddden state if (fAttrLayout != 0 || fNY.empty()) { - out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_state = std::vector<" << fType << ">(" << hs_size << ");\n"; + declareVector("hidden_state", hs_size); } - - out << "\n"; - - return out.str(); } @@ -234,12 +228,14 @@ auto ROperator_GRU::Generate(std::string OpName) size_t input_size = fShapeX[2]; size_t num_directions = fShapeW[0]; + auto getVec = [&](std::string const &name) { return "tensor_op_gru_" + fNX + "_" + name; }; + // set the input if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << fType << " const* " << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n"; + out << SP << fType << " * " << OpName << "_input = " << getVec("input") << ";\n"; } else { out << SP << fType << " " << OpName << "_input[" << seq_length * batch_size * input_size << "];\n"; } @@ -261,8 +257,7 @@ auto ROperator_GRU::Generate(std::string OpName) << fNInitial_h << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_initial_hidden_state = fVec_" << OpName - << "_initial_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_initial_hidden_state = " << 
getVec("initial_hidden_state") << ";\n"; } else { out << SP << fType << " " << OpName << "_initial_hidden_state[" << num_directions * batch_size * fAttrHiddenSize << "];\n"; @@ -283,9 +278,9 @@ auto ROperator_GRU::Generate(std::string OpName) // Set the feedforward size_t feedforward_size = seq_length * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_f_update_gate = fVec_" << OpName << "_f_update_gate.data();\n"; - out << SP << fType << " * " << OpName << "_f_reset_gate = fVec_" << OpName << "_f_reset_gate.data();\n"; - out << SP << fType << " * " << OpName << "_f_hidden_gate = fVec_" << OpName << "_f_hidden_gate.data();\n"; + out << SP << fType << " * " << OpName << "_f_update_gate = " << getVec("f_update_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_f_reset_gate = " << getVec("f_reset_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_f_hidden_gate = " << getVec("f_hidden_gate") << ";\n"; } else { out << SP << fType << " " << OpName << "_f_update_gate[" << feedforward_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_f_reset_gate[" << feedforward_size << "] = {0};\n"; @@ -294,9 +289,9 @@ auto ROperator_GRU::Generate(std::string OpName) // Set the gates size_t hidden_state_size = seq_length * num_directions * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_update_gate = fVec_" << OpName << "_update_gate.data();\n"; - out << SP << fType << " * " << OpName << "_reset_gate = fVec_" << OpName << "_reset_gate.data();\n"; - out << SP << fType << " * " << OpName << "_hidden_gate = fVec_" << OpName << "_hidden_gate.data();\n"; + out << SP << fType << " * " << OpName << "_update_gate = " << getVec("update_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_reset_gate = " << getVec("reset_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_hidden_gate = " << getVec("hidden_gate") << ";\n"; } else { out << SP << fType << " " 
<< OpName << "_update_gate[" << hidden_state_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_reset_gate[" << hidden_state_size << "] = {0};\n"; @@ -307,14 +302,14 @@ auto ROperator_GRU::Generate(std::string OpName) out << SP << fType << " *" << OpName << "_hidden_state = tensor_" << fNY << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_hidden_state = fVec_" << OpName << "_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_hidden_state = " << getVec("hidden_state") << ";\n"; } else { out << SP << fType << " " << OpName << "_hidden_state[" << hidden_state_size << "] = {0};\n"; } } if (fUseSession) { - out << SP << fType << " * " << OpName << "_feedback = fVec_" << OpName << "_feedback.data();\n"; + out << SP << fType << " * " << OpName << "_feedback = " << getVec("feedback") << ";\n"; } else { out << SP << fType << " " << OpName << "_feedback[" << batch_size * fAttrHiddenSize << "] = {0};\n"; } diff --git a/core/inc/SOFIE/ROperator_Gather.hxx b/core/inc/SOFIE/ROperator_Gather.hxx new file mode 100644 index 0000000..3c16f18 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Gather.hxx @@ -0,0 +1,400 @@ +#ifndef SOFIE_ROPERATOR_GATHER +#define SOFIE_ROPERATOR_GATHER + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include + + +namespace SOFIE{ + +class ROperator_Gather final : public ROperator +{ +private: + + int64_t fAttrAxis = 0; + + std::string fNX; + std::string fNIndices; + std::string fNY; + + std::vector fShapeX; + std::vector fShapeIndices; + std::vector fShapeY; + + std::vector fIndices; // indices vector in case they are known at initialization + + std::string fType; + +public: + ROperator_Gather(){} + ROperator_Gather(int64_t attrAxis, std::string nameX, std::string nameIndices, std::string nameY): + fAttrAxis(attrAxis), fNX(UTILITY::Clean_name(nameX)), fNIndices(UTILITY::Clean_name(nameIndices)), 
fNY(UTILITY::Clean_name(nameY)) { + fInputTensorNames = { fNX, fNIndices }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; + return ret; + } + + void Initialize(RModel& model) override { + if (!model.CheckIfTensorAlreadyExist(fNX)) { + throw std::runtime_error("SOFIE Gather Op Input Tensor " + fNX + " is not found in model"); + } + fShapeX = model.GetDimTensorShape(fNX); + if (model.Verbose()) + std::cout << "Gather - initial shape " << ConvertDimShapeToString(fShapeX) << " shape of indices " + << ConvertDimShapeToString(model.GetDimTensorShape(fNIndices)) << std::endl; + // fShapeIndices can be dynamic + fShapeIndices = model.GetDimTensorShape(fNIndices); + size_t q = fShapeIndices.size(); + // Axis in range [0, r) where r=rank(X) + size_t r = fShapeX.size(); + // Set the axis + if (fAttrAxis < 0) { + fAttrAxis = fAttrAxis + int64_t(r); + } + + + // case indices tensor is initialized + if (model.IsInitializedTensor(fNIndices)) { + // empty shape Indices is a scalar value for the indices + size_t indicesLength = ConvertShapeToLength(model.GetTensorShape(fNIndices)); + int64_t* indicesData = static_cast(model.GetInitializedTensorData(fNIndices).get()); + // update indices data in case of negative dim values + for (size_t i = 0; i < indicesLength; i++) { + // move this at generation time? 
+ if (!fShapeX[fAttrAxis].isParam) { + if (indicesData[i] < 0) { + indicesData[i] += fShapeX[fAttrAxis].dim; + } + } + } + // Save in a vector gather Indices of size q + fIndices = std::vector(indicesData, indicesData + indicesLength); + } + // Output shape + if (model.Verbose()) + std::cout << "Gather: q and r " << q << " " << r << " shape indices " << ConvertDimShapeToString(fShapeIndices) << std::endl; + + if (fShapeY.empty()) { + fShapeY.resize(q + r - 1); + if (fAttrAxis > 0) { + // Copy shape of X[0, ..., axis-1) to Shape of Y[0, ..., axis-1) + std::copy(fShapeX.begin(), fShapeX.begin() + fAttrAxis, fShapeY.begin()); + } + // Set shape of Y[axis, ..., axis + q) + for (size_t i = 0; i < q; i++) { + fShapeY[fAttrAxis + i] = Dim{ fShapeIndices[i]}; + } + // Copy shape of X[axis + 1, ..., r) to shape of Y[axis + q, ... q + r - 1) + std::copy(fShapeX.begin() + fAttrAxis + 1, fShapeX.end(), fShapeY.begin() + fAttrAxis + q); + } + // case input is known (type is an integer) and input indices is a scalar (or vector of size 1) + if (model.IsInitializedTensor(fNX) && q <= 1 && r == 1 && fIndices.size() > 0) { + auto shapeX = ConvertShapeToInt(fShapeX); // we assume model is not dynamic + auto shapeY = ConvertShapeToInt(fShapeY); + if (model.GetTensorType(fNX) == ETensorType::INT64) { + auto inputData = static_cast(model.GetInitializedTensorData(fNX).get()); + // if q <=1 and r = 1 output length = 1 (it is a scalar) + std::vector outputData(1); //ConvertShapeToLength(shapeY)); + outputData[0] = inputData[fIndices[0]]; + model.AddConstantTensor(fNY, shapeY, outputData.data()); + if (model.Verbose()) + std::cout << "Gather: " << fNX << " " << ConvertShapeToString(shapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(shapeY) + << " and values " << ConvertValuesToString(outputData) << " (constant) " << std::endl; + fIsOutputConstant = true; + } + } + // case input is a shape tensor (r is == 1 by definition) and indices are known + else if 
(model.IsShapeTensor(fNX) && q <=1 && fIndices.size() > 0) { + auto inputData = model.GetShapeTensorValues(fNX); + // if r == 1 and q<=1 then output length is 1 (is a scalar or tensor of size1) + std::vector outputData(1); + outputData[0] = inputData[fIndices[0]]; + if (outputData[0].isParam) { + fIsOutputConstant = true; + // shapeY can be scalar or vector of size1 + model.AddShapeTensor(fNY, outputData, fShapeY.size() == 0); + if (model.Verbose()) + std::cout << "Gather: " << fNX << " " << ConvertDimShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) + << " and values " << ConvertDimShapeToString(outputData) << " (shape) " << std::endl; + } else { + int64_t value = static_cast(outputData[0].dim); + auto shapeY = ConvertShapeToInt(fShapeY); + model.AddConstantTensor(fNY, shapeY, &value); + fIsOutputConstant = true; + if (model.Verbose()) + std::cout << "Gather: " << fNX << " " << ConvertDimShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) + << " and values {" << value << "} (constant) " << std::endl; + } + } + if (!fIsOutputConstant) { + // Add output tensor + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + fType = ConvertTypeToString(model.GetTensorType(fNX)); + if (model.Verbose()) + std::cout << "Gather: input " << fNX << " " << ConvertDimShapeToString(fShapeX) << " indices " << fNIndices << ConvertDimShapeToString(fShapeIndices) + << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) << std::endl; + } + } + + std::string Generate(std::string opName) override { + opName = "op_" + opName; + std::stringstream out; + out << "//--------- Gather " << opName << " --> " << fNY << " " << ConvertDimShapeToString(fShapeY) << "\n"; + if (fIsOutputConstant) { + // no code to generate here for constant output. 
Tensor output is defined in Session constructor + out << "//--------------------(constant)----------\n"; + return out.str(); + } + // The shape of the output is q + r - 1 + size_t r = fShapeX.size(); + // Indices of shape q + size_t q = fShapeIndices.size(); + // Strides + auto stridesX = UTILITY::ComputeStrideFromShape(fShapeX); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); + + // case fIndices is not known we need to correct for negative axis indices at run-time + if (fIndices.empty()) { + auto indicesLength = ConvertDimShapeToLength(fShapeIndices); + out << SP << "// correct in case of negative gather indices\n"; + out << SP << "for (size_t i = 0; i < " << indicesLength << "; i++){\n"; + out << SP << SP << "if (tensor_" << fNIndices << "[i] < 0)\n"; + out << SP << SP << SP << "tensor_" << fNIndices << "[i] += " << fShapeX[fAttrAxis] << ";\n"; + out << SP << "}\n"; + } + + // Fill the output Y[j_0, j_1, ..., j_{axis - 1}, i_0, i_1, ..., i_{q - 1}, j_{axis + 1}, ..., j_{r - 1}] + // [0 ... axis) [axis ... axis + q) [axis + q ... q + r - 1) + // iterate in [0 ... axis) [0 ... q) [axis ... 
r - 1) + // for j_0, j_1, ..., j_{axis-1} + + for (size_t j = 0; j < size_t(fAttrAxis); j++) { + std::string index = "j_" + std::to_string(j); + for (size_t k = 0; k <= j; k++) out << SP; + out << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[j] << "; " << index << "++) {\n"; + } + // for i_0, i_1, ..., i_{q - 1} + for (size_t i = 0; i < q; i++) { + std::string index = "i_" + std::to_string(i); + for (size_t k = 0; k <= i + fAttrAxis; k++) out << SP; + out << "for (size_t " << index << " = " << 0 << "; " << index << " < " << fShapeIndices[i] << "; " << index << "++) {\n"; + } + // for j_axis, j_{axis + 1}, ..., j_{r - 1} + for (size_t j = fAttrAxis; j + 1 < r; j++) { + std::string index = "j_" + std::to_string(q+j); // annotate index using output axis + for (size_t k = 0; k <= q + j; k++) out << SP; + out << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[q + j] << "; " << index << "++) {\n"; + } + + // add a scope for local variables in case above loop are not done + if (fAttrAxis == 0 && q == 0 && r <= 1) + out << SP << "{ // scalar case \n"; + + // output index + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t y_index = "; + for (size_t j = 0; j < size_t(fAttrAxis); j++) { + if (j > 0) out << " + "; + out << "j_" << j; + if (stridesY[j].dim != 1) out << " * " << stridesY[j]; + } + for (size_t i = 0; i < q; i++) { + if (fAttrAxis + i > 0) out << " + "; + out << "i_" << i; + if (stridesY[fAttrAxis + i].dim != 1) out << " * " << stridesY[fAttrAxis + i]; + } + for (size_t j = fAttrAxis; j + 1 < r; j++) { + if (j + q > 0) out << " + "; + out << "j_" << q+j; + if (stridesY[q+j].dim != 1) out << " * " << stridesY[q+j]; + } + // empty case + if (fAttrAxis == 0 && q == 0 && r <= 1) + out << "0"; + out << ";\n"; + + // input Indices + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t i_index = "; + for (size_t i = 0; i < q; i++) { + if (i > 0) out << " + "; + out << "i_" << i; + if (stridesIndices[i].dim != 
1) out << " * " << stridesIndices[i]; + } + // empty case + if (q == 0) + out << "0"; + out << ";\n"; + + // K + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t k = static_cast(" << "tensor_" << fNIndices << "[i_index]" << ");\n"; + // Input + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t x_index = k"; + if (stridesX[fAttrAxis].dim != 1) out << " * " << stridesX[fAttrAxis]; + for (size_t j = 0; j < size_t(fAttrAxis); j++) { + out << " + "; + out << " j_" << j; + if (stridesX[j].dim != 1) out << " * " << stridesX[j]; + } + // for input corresponding stride is axis+1,.... r + // loop is on j from fAttrAxis, so consider stridesX[j+1] + for (size_t j = fAttrAxis; j+1 < r; j++) { + out << " + "; + out << " j_" << q+j; + if (stridesX[j+1].dim != 1) out << " * " << stridesX[j+1]; + } + out << ";\n"; + for (size_t k = 0; k < q + r; k++) out << SP; + out << "tensor_" << fNY << "[y_index] = tensor_" << fNX << "[x_index];\n"; + + // end loops j_k, j_{k + 1}, ..., j_{r - 2} + for (size_t j = q+r-1; j > 0; j--) { + for (size_t k = 0; k \n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "int64_t const* __restrict__ indices,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + stridesY[d].GetVal() + "u) % " + + 
fShapeY[d].GetVal() + "u;\n"; + } + op += "\n"; + + // Output dims [axis ... axis+q) correspond to the indices tensor dims [0 ... q) + // so i_index = sum over i in [0,q): out_{axis+i} * stridesIndices[i] + if (q == 0) { + op += SP + SP + SP + SP + "std::size_t const i_index = 0u;\n"; + } else { + op += SP + SP + SP + SP + "std::size_t const i_index =\n"; + for (std::size_t i = 0; i < q; ++i) { + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(fAttrAxis + i) + + " * " + stridesIndices[i].GetVal() + "u"; + op += (i + 1 < q) ? " +\n" : ";\n"; + } + } + op += "\n"; + + op += SP + SP + SP + SP + "int64_t k = indices[i_index];\n"; + op += SP + SP + SP + SP + "if (k < 0) k += " + fShapeX[fAttrAxis].GetVal() + ";\n"; + op += SP + SP + SP + SP + "if (k < 0) k = 0;\n"; + op += SP + SP + SP + SP + "if (k >= static_cast(" + fShapeX[fAttrAxis].GetVal() + ")) " + + "k = static_cast(" + fShapeX[fAttrAxis].GetVal() + ") - 1;\n\n"; + + // x_index = k * stridesX[axis] + // + sum over j in [0, axis): out_j * stridesX[j] + // + sum over j in [axis+1, r): out_{j-1+q} * stridesX[j] + // (the dims after axis in Y are shifted by q-1 relative to X) + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + op += SP + SP + SP + SP + SP + "static_cast(k) * " + stridesX[fAttrAxis].GetVal() + "u"; + for (std::size_t j = 0; j < static_cast(fAttrAxis); ++j) { + op += " +\n" + SP + SP + SP + SP + SP + + "out_" + std::to_string(j) + " * " + stridesX[j].GetVal() + "u"; + } + for (std::size_t j = fAttrAxis + 1; j < r; ++j) { + // in Y, the coord for X's dim j lives at output dim q + j - 1 + op += " +\n" + SP + SP + SP + SP + SP + + "out_" + std::to_string(q + j - 1) + " * " + stridesX[j].GetVal() + "u"; + } + op += ";\n\n"; + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; +} + +std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if 
(fIsOutputConstant) return ""; + opName = "op_" + opName; + std::string kname = "GatherKernel_" + opName; + return SP + kname + " gatherKernel_" + opName + ";\n"; +} + +std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE Gather Op called to Generate without being initialized first"); + + auto totalElements = ConvertDimShapeToLength(fShapeY); + std::string kname = "gatherKernel_" + opName; + + std::stringstream out; + out << "\n//------ GATHER_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "auto task_" << opName << " = alpaka::createTaskKernel(workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNIndices << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << opName << ");\n"; + return out.str(); +} + +}; + +}//SOFIE + +#endif //SOFIE_ROPERATOR_RELU diff --git a/core/inc/SOFIE/ROperator_GatherND.hxx b/core/inc/SOFIE/ROperator_GatherND.hxx new file mode 100644 index 0000000..ffcdab8 --- /dev/null +++ b/core/inc/SOFIE/ROperator_GatherND.hxx @@ -0,0 +1,297 @@ +#ifndef SOFIE_ROPERATOR_GATHERND +#define SOFIE_ROPERATOR_GATHERND + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include +#include + +namespace SOFIE { + +class ROperator_GatherND final : public ROperator +{ +private: + + int64_t fBatchDims = 0; + + std::string fNData; + std::string fNIndices; + std::string fNY; + + std::vector 
fShapeData; + std::vector fShapeIndices; + std::vector fShapeY; + + std::string fType; + +public: + ROperator_GatherND() {} + ROperator_GatherND(int64_t batchDims, + std::string nameData, + std::string nameIndices, + std::string nameY) + : fBatchDims(batchDims), + fNData(UTILITY::Clean_name(nameData)), + fNIndices(UTILITY::Clean_name(nameIndices)), + fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = { fNData, fNIndices }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return { input[0] }; + } + + std::vector> ShapeInference(std::vector> input) override { + return { input[0] }; + } + + void Initialize(RModel& model) override { + if (!model.CheckIfTensorAlreadyExist(fNData)) + throw std::runtime_error("SOFIE GatherND: data tensor " + fNData + " not found in model"); + if (!model.CheckIfTensorAlreadyExist(fNIndices)) + throw std::runtime_error("SOFIE GatherND: indices tensor " + fNIndices + " not found in model"); + + fShapeData = model.GetTensorShape(fNData); + fShapeIndices = model.GetTensorShape(fNIndices); + + size_t r = fShapeData.size(); + size_t q = fShapeIndices.size(); + size_t b = static_cast(fBatchDims); + size_t last_idx_dim = fShapeIndices.back(); + + if (r < 1) + throw std::runtime_error("SOFIE GatherND: data rank must be >= 1"); + if (q < 1) + throw std::runtime_error("SOFIE GatherND: indices rank must be >= 1"); + if (b >= std::min(q, r)) + throw std::runtime_error("SOFIE GatherND: batch_dims must be < min(q, r)"); + if (last_idx_dim > r - b) + throw std::runtime_error("SOFIE GatherND: indices_shape[-1] must be <= r - batch_dims"); + + for (size_t i = 0; i < b; ++i) { + if (fShapeData[i] != fShapeIndices[i]) + throw std::runtime_error("SOFIE GatherND: first batch_dims dimensions of data and indices must match"); + } + + // Output shape: batch_dims + indices[0..q-2] + data[b + last_idx_dim .. 
r-1] + // rank = b + (q - b - 1) + (r - b - last_idx_dim) + // = q + r - last_idx_dim - 1 - b + fShapeY.clear(); + for (size_t i = 0; i < b; ++i) + fShapeY.push_back(fShapeData[i]); + for (size_t i = b; i + 1 < q; ++i) + fShapeY.push_back(fShapeIndices[i]); + for (size_t i = b + last_idx_dim; i < r; ++i) + fShapeY.push_back(fShapeData[i]); + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNData), fShapeY); + fType = ConvertTypeToString(model.GetTensorType(fNData)); + + if (model.Verbose()) + std::cout << "GatherND: data " << ConvertShapeToString(fShapeData) + << " indices " << ConvertShapeToString(fShapeIndices) + << " batch_dims=" << fBatchDims + << " -> " << fNY << " " << ConvertShapeToString(fShapeY) << std::endl; + } + + std::string Generate(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE GatherND called to Generate without being initialized first"); + + size_t r = fShapeData.size(); + size_t q = fShapeIndices.size(); + size_t b = static_cast(fBatchDims); + size_t last_idx_dim = fShapeIndices.back(); + + auto stridesData = UTILITY::ComputeStrideFromShape(fShapeData); + auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + size_t totalOutput = ConvertShapeToLength(fShapeY); + + std::stringstream out; + out << SP << "//--------- GatherND operator " << opName << "\n"; + + out << SP << "for (size_t out_idx = 0; out_idx < " << totalOutput << "; out_idx++) {\n"; + + out << SP << SP << "size_t rem = out_idx;\n"; + size_t Dy = fShapeY.size(); + for (size_t d = 0; d < Dy; ++d) { + out << SP << SP << "size_t oy_" << d << " = rem / " << stridesY[d] << ";\n"; + out << SP << SP << "rem %= " << stridesY[d] << ";\n"; + } + + out << SP << SP << "size_t idx_base = 0;\n"; + for (size_t i = 0; i < b; ++i) + out << SP << SP << "idx_base += oy_" << i << " * " << stridesIndices[i] << ";\n"; + for (size_t i = b; i + 1 < q; ++i) + 
out << SP << SP << "idx_base += oy_" << i << " * " << stridesIndices[i] << ";\n"; + + out << SP << SP << "size_t data_idx = 0;\n"; + for (size_t i = 0; i < b; ++i) + out << SP << SP << "data_idx += oy_" << i << " * " << stridesData[i] << ";\n"; + + out << SP << SP << "for (size_t k = 0; k < " << last_idx_dim << "; k++) {\n"; + out << SP << SP << SP << "int64_t idx_val = tensor_" << fNIndices + << "[idx_base + k * " << stridesIndices[q - 1] << "];\n"; + out << SP << SP << SP << "if (idx_val < 0) idx_val += " << "static_cast(tensor_" + << fNData << "_shape[" << b << " + k]);\n"; + out << SP << SP << SP << "data_idx += static_cast(idx_val) * " << "data_stride_b_plus_k_" << opName << "[k];\n"; + out << SP << SP << "}\n"; + + // Accumulate trailing data dims from output coords + // Y dims [b + (q-b-1) .. ] correspond to data dims [b + last_idx_dim .. r-1] + size_t y_trailing_start = b + (q - b - 1); + for (size_t i = b + last_idx_dim; i < r; ++i) { + size_t oy_dim = y_trailing_start + (i - (b + last_idx_dim)); + out << SP << SP << "data_idx += oy_" << oy_dim << " * " << stridesData[i] << ";\n"; + } + + out << SP << SP << "tensor_" << fNY << "[out_idx] = tensor_" << fNData << "[data_idx];\n"; + out << SP << "}\n"; + + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE GatherND called to Generate without being initialized first"); + + size_t r = fShapeData.size(); + size_t q = fShapeIndices.size(); + size_t b = static_cast(fBatchDims); + size_t last_idx_dim = fShapeIndices.back(); + + auto stridesData = UTILITY::ComputeStrideFromShape(fShapeData); + auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + size_t Dy = fShapeY.size(); + size_t totalOutput = ConvertShapeToLength(fShapeY); + + std::string kname = "GatherNDKernel_" + opName; + + std::string op; + op = 
"\n//------ GATHERND_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ data,\n"; + op += SP + SP + SP + "int64_t const* __restrict__ indices,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (size_t d = 0; d < Dy; ++d) { + op += SP + SP + SP + SP + "std::size_t const oy_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(stridesY[d]) + "u) % " + + std::to_string(fShapeY[d]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const idx_base =\n"; + // batch dims: oy_0..oy_{b-1} * stridesIndices[0..b-1] + // outer idx dims: oy_b..oy_{b+(q-b-2)} * stridesIndices[b..q-2] + bool first = true; + for (size_t i = 0; i < q - 1; ++i) { + op += SP + SP + SP + SP + SP + + (first ? "" : "+ ") + + "oy_" + std::to_string(i) + " * " + std::to_string(stridesIndices[i]) + "u\n"; + first = false; + } + if (first) op += SP + SP + SP + SP + SP + "0u\n"; // q==1: scalar index tuple + op += SP + SP + SP + SP + SP + ";\n\n"; + + op += SP + SP + SP + SP + "std::size_t data_idx =\n"; + first = true; + for (size_t i = 0; i < b; ++i) { + op += SP + SP + SP + SP + SP + + (first ? 
"" : "+ ") + + "oy_" + std::to_string(i) + " * " + std::to_string(stridesData[i]) + "u\n"; + first = false; + } + if (first) op += SP + SP + SP + SP + SP + "0u\n"; + op += SP + SP + SP + SP + SP + ";\n\n"; + + op += SP + SP + SP + SP + "// Read " + std::to_string(last_idx_dim) + "-element index tuple\n"; + for (size_t k = 0; k < last_idx_dim; ++k) { + size_t idx_offset = k; + size_t data_axis = b + k; + op += SP + SP + SP + SP + "{\n"; + op += SP + SP + SP + SP + SP + + "int64_t idx_val = indices[idx_base + " + + std::to_string(idx_offset) + "u];\n"; + op += SP + SP + SP + SP + SP + + "if (idx_val < 0) idx_val += " + + std::to_string(fShapeData[data_axis]) + ";\n"; + op += SP + SP + SP + SP + SP + + "data_idx += static_cast(idx_val) * " + + std::to_string(stridesData[data_axis]) + "u;\n"; + op += SP + SP + SP + SP + "}\n"; + } + op += "\n"; + + size_t y_trailing_start = b + (q - b - 1); + for (size_t i = b + last_idx_dim; i < r; ++i) { + size_t oy_dim = y_trailing_start + (i - (b + last_idx_dim)); + op += SP + SP + SP + SP + + "data_idx += oy_" + std::to_string(oy_dim) + + " * " + std::to_string(stridesData[i]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "output[elem_idx] = data[data_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string kname = "GatherNDKernel_" + opName; + return SP + kname + " gatherNDKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE GatherND called to Generate without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::string kname = "gatherNDKernel_" + opName; + + std::stringstream out; + out << "\n//------ GATHERND_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << 
opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNData << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNIndices << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } +}; + +} // SOFIE + +#endif // SOFIE_ROPERATOR_GATHERND diff --git a/core/inc/SOFIE/ROperator_Gemm.hxx b/core/inc/SOFIE/ROperator_Gemm.hxx new file mode 100644 index 0000000..c1c6c1c --- /dev/null +++ b/core/inc/SOFIE/ROperator_Gemm.hxx @@ -0,0 +1,667 @@ +#ifndef SOFIE_ROPERATOR_GEMM +#define SOFIE_ROPERATOR_GEMM + + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include +#include +#include +#include + + +namespace SOFIE{ + + + template + class ROperator_Gemm final : public ROperator + { + + private: + bool fIsDynamic = false; + bool fBroadcastBias = false; + bool fCheckBiasShapeAtRuntime = false; // flag to identify the need to do a run time check of bias shape compatibility in case of dynamic shapes and uni-directional broadcasting + + float fAttrAlpha = 1.0; + float fAttrBeta = 1.0; + int_t fAttrTransA = 0; + int_t fAttrTransB = 0; + + std::string fNA; + std::string fNB; + std::string fNC = ""; + std::string fNY; + std::string fType; + EActivationType fActivation; + std::vector fShapeA; + std::vector fShapeB; + std::vector fShapeC; + std::vector fDimShapeC; + std::vector fShapeY; + RModel * fModel = nullptr; + + public: + + ROperator_Gemm(){} + ROperator_Gemm(float alpha, float beta, int_t transA, int_t transB, std::string nameA, std::string nameB, std::string nameY, 
EActivationType activation=EActivationType::UNDEFINED): + fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), + fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) + { + fActivation = activation; + fType = "float"; + static_assert(std::is_same_v, + "TMVA::SOFIE - Unsupported type parsing a Gemm operator"); + fInputTensorNames = { fNA, fNB }; + fOutputTensorNames = { fNY }; + fKind = OperatorKind::GEMM; + } + + ROperator_Gemm(float alpha, float beta, int_t transA, int_t transB, std::string nameA, std::string nameB, std::string nameC, std::string nameY, EActivationType activation=EActivationType::UNDEFINED): + fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), + fNB(UTILITY::Clean_name(nameB)), fNC(UTILITY::Clean_name(nameC)), fNY(UTILITY::Clean_name(nameY)), fActivation(activation) + { + fActivation = activation; + fType = "float"; + + fInputTensorNames = {fNA, fNB, fNC}; + fOutputTensorNames = { fNY }; + fKind = OperatorKind::GEMM; + } + + std::vector TypeInference(std::vector input) override { + ETensorType out = input[0]; + return {out}; + } + + template + std::vector DoShapeInference(const std::vector> & input){ + if (input.size() > 3) throw std::runtime_error("SOFIE Gemm Op Shape Inference only need 2 or 3 input tensor"); + // accept tensor with input dimensions > 2 + // example: A = (d1,d2,...,N1,N2) B = (d1,d2,...,N2,N3) --> Y = (d1,d2,..,N1,N3) + for (auto& i: input){ + if (i.size() < 2){ + throw std::runtime_error("SOFIE Gemm Op Shape Inference only accept input tensor with >=2 dimensions"); + } + } + + // when there are 3 inputs shape of Y is the one of C + if (input.size() == 3){ + //shape of C is shape of Y + return input[2]; + } + // ioffset cannot be less than 2 + int ioffset = input[0].size()-2; // in case of tensors with dim > 2 + + std::vector s_a(input[0].begin() + ioffset, input[0].begin() + ioffset + 2); + 
std::vector s_b(input[1].begin() + ioffset, input[1].begin() + ioffset + 2); + // reverse in case of transpose + if (fAttrTransA){ + std::reverse(s_a.begin(), s_a.end()); + } + if (fAttrTransB){ + std::reverse(s_b.begin(), s_b.end()); + } + std::vector s_y; + s_y.reserve(input[0].size()); + if (input[0].size() > 2 && input[1].size() == input[0].size()) { + // in case of dim > 2 first dimensions are equal to the input ones not + // equal to 1 (e.g. (1,2,3) * (2,3,4) -> (2,2,4)) + // here could probably use the Broadcasting function UTILITY::MultidirectionalBroadcastShape + for (size_t i = 0; i < input[0].size()-2; i++) { + Dim valueA = input[0][i]; + Dim valueB = input[1][i]; + if (valueA.GetVal() != valueB.GetVal()) { + if (valueB.GetVal() == "1") + s_y.push_back(input[0][i]); + else if (valueA.GetVal() == "1") + s_y.push_back(input[1][i]); + else if (!valueA.isParam && !valueB.isParam) + throw std::runtime_error("SOFIE Gemm Op - invalid input shapes " + valueA.GetVal() + " and " + + valueB.GetVal()); + else if (valueA.isParam && valueB.isParam){ + // check which parameter is first in RModel list + auto & dimNames = fModel->GetDimShapeNames(); + auto p1 = std::find(dimNames.begin(), dimNames.end(), valueA.param); + auto p2 = std::find(dimNames.begin(), dimNames.end(), valueB.param); + if (p1 < p2) s_y.push_back(input[0][i]); + else s_y.push_back(input[1][i]); + } + else if (!valueA.isParam) + s_y.push_back(input[0][i]); + else if (!valueB.isParam) + s_y.push_back(input[1][i]); + else + throw std::runtime_error("SOFIE Gemm Op - invalid input shapes " + valueA.GetVal() + " and " + + valueB.GetVal()); + } + else + s_y.push_back(input[0][i]); + } + } + + s_y.push_back(s_a[0]); + s_y.push_back(s_b[1]); + return s_y; + } + + std::vector> ShapeInference(std::vector> input) override { + std::vector> ret; + ret.push_back(DoShapeInference(input)); + return ret; + } + std::vector DynamicShapeInference(const std::vector> & input){ + return DoShapeInference(input); + } + + + + 
void Initialize(RModel& model) override { + //TODO: propagate A or B as specified by ONNX standard + fModel = &model; + + if ((model.CheckIfTensorAlreadyExist(fNA) == false) || (model.CheckIfTensorAlreadyExist(fNB) == false) ){ //input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("SOFIE Gemm Op Input Tensor " + fNA + " or " + fNB + " is not found in model"); + } + if (fNC != ""){ + if (model.CheckIfTensorAlreadyExist(fNC) == false){ //input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("SOFIE Gemm Op Input Tensor " + fNC + " is not found in model"); + } + } + if (model.IsDynamicTensor(fNA) || model.IsDimInputTensor(fNA) ) { + fShapeA = model.GetDynamicTensorShape(fNA); + fIsDynamic = true; + } else { + auto shapeA_int = model.GetTensorShape(fNA); + fShapeA = ConvertShapeToDim(shapeA_int); + } + // case A is of dim1 we prepend a 1 but we need to remove later + bool prependOne = false; + if (fShapeA.size() == 1) { + fShapeA.insert(fShapeA.begin(), Dim(1)); + prependOne = true; + } + + if (model.IsDynamicTensor(fNB) || model.IsDimInputTensor(fNB)) { + fShapeB = model.GetDynamicTensorShape(fNB); + fIsDynamic = true; + } + else { + auto shapeB_int = model.GetTensorShape(fNB); + fShapeB = ConvertShapeToDim(shapeB_int); + } + // case B is dim1 we append a 1 but we need to remove later + bool appendOne = false; + if (fShapeB.size() == 1) { + fShapeB.insert(fShapeB.end(), Dim(1)); + appendOne = true; + } + // assume if not shape is 2 that extra values are 1. 
+ // implement also MatMul case where we stack matrices (see numpy.matmul) + if (fShapeA.size() != fShapeB.size()) { + // if different dimensions we prepend 1 values + if (fShapeA.size() < fShapeB.size()) { + fShapeA.insert(fShapeA.begin(), fShapeB.size()-fShapeA.size(), Dim(1)); + } else if (fShapeB.size() < fShapeA.size()) { + fShapeB.insert(fShapeB.begin(), fShapeA.size()-fShapeB.size(), Dim(1)); + } + } + + fShapeY = DynamicShapeInference({fShapeA, fShapeB}); + std::vector shapeY = ConvertShapeToInt(fShapeY); + + // bias is normally not dynamic (not support it for time being) + if (fNC != ""){ + if (model.IsDynamicTensor(fNC)) + fDimShapeC = model.GetDynamicTensorShape(fNC); + else { + fShapeC = model.GetTensorShape(fNC); + fDimShapeC = ConvertShapeToDim(fShapeC); + } + // for dynamic outputs broadcasting is always needed + bool broadcast_needed = false; + if (fIsDynamic && shapeY.empty()) + broadcast_needed = true; + else + // consider broadcasting also if they have different length + broadcast_needed = (fShapeC != shapeY); + + + if (broadcast_needed) { + fBroadcastBias = true; + // check if broadcasting is compatible and note that prepend 1 to shapeC + auto r = UTILITY::MultidirectionalBroadcastShape(fShapeY, fDimShapeC); + // return flag must not have bit equal to 2 since this is a unidirectional broadcast of C->Y + // + if ((r.first & 2) == 2) { + throw std::runtime_error("SOFIE Gemm Op - bias tensor of shape " + ConvertDimShapeToString(fDimShapeC) + " cannot be uni-directional broadcasted to " + ConvertDimShapeToString(fShapeY)); + } else if (r.first == 4) { + // we need to do a run time check of bias shape if it is compatible + fCheckBiasShapeAtRuntime = true; + } + fShapeC = ConvertShapeToInt(fDimShapeC); + } + } + + // remove appended or prepended value of 1 in Y + if (prependOne) { + if (fIsDynamic) + fShapeY.erase(fShapeY.begin()); + else + shapeY.erase(shapeY.begin()); + } + if (appendOne) { + if (fIsDynamic) + fShapeY.erase(fShapeY.end()-1); + else 
+ shapeY.erase(shapeY.end()-1); + } + + if (!fIsDynamic) + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), shapeY); + else + model.AddDynamicTensor(fNY, model.GetTensorType(fNA), fShapeY); + + if (model.Verbose()){ + std::cout << "Gemm (or MatMul) " << " ---> " << fNY << " shape "; + if (fIsDynamic) + std::cout << ConvertDimShapeToString(fShapeY) << std::endl; + else + std::cout << ConvertShapeToString(shapeY) << std::endl; + } + + model.AddNeededStdLib("algorithm"); + } + + std::string Generate(std::string opName) override { + opName = "op_" + opName; + + // if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fShapeC.empty())) { + // throw std::runtime_error("SOFIE Gemm Op called to Generate without being initialized first"); + // } + std::stringstream out; + out << "\n//--------- Gemm " << opName << " " << ConvertDimShapeToString(fShapeA) << " * " << ConvertDimShapeToString(fShapeB) + << " -> " << ConvertDimShapeToString(fShapeY) << "\n"; + // need to consider case A and B have dim > 2 (for MatMul) + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + int64_t dimY = fShapeY.size(); + int64_t dimC = fDimShapeC.size(); + if (dimA != dimB || dimA != dimY || (fBroadcastBias && dimC != dimY)) { + std::cout << " shape A " << ConvertDimShapeToString(fShapeA) + << " shape B " << ConvertDimShapeToString(fShapeB) + << " shape C " << ConvertDimShapeToString(fDimShapeC) + << " shape Y " << ConvertDimShapeToString(fShapeY) << std::endl; + throw std::runtime_error("SOFIE Gemm(MatMul) has invalid shape for inputs or output"); + } + auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? 
fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + // size of A: if (transposeA) is m*k else k*m + // size of B n*k + std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; + // extra dimensions in case of stacked MatMul + std::vector sExtraY; + for (int64_t i = 0; i < dimY-2; i++) { + sExtraY.push_back(fShapeY[i]); + } + auto lengthGemm = ConvertDimShapeToLength(sY); // size of the Gemm operation + auto lengthExtra_Y = ConvertDimShapeToLength(sExtraY); // extra length in case input tensors are of dim>2 (MatMul) + std::string lengthExtra_C; + std::vector sExtraC; + std::vector sC; + bool haveExtraC = false; + if (dimC > 2) { + sC = {fDimShapeC[dimC-2], fDimShapeC[dimC-1]}; + for (int64_t i = 0; i < dimC-2; i++) { + sExtraC.push_back(fDimShapeC[i]); + } + lengthExtra_C = ConvertDimShapeToLength(sExtraC); + if (lengthExtra_C != "1") haveExtraC = true; + } else if (dimC > 0) { + for (int64_t i = 0; i < dimC; i++) { + sC.push_back(fDimShapeC[i]); + } + } + + // case bias is present + if (!fNC.empty()){ + // when the 2 last dims of bias and Y are not compatible we need to perform a run time broadcast + if (sC != sY) fBroadcastBias = true; + if (!fBroadcastBias) { + // add a check in case broadcasting was not needed or done outside of session + // C should have smaller dimension of Y + if (!fIsDynamic) { + if ((std::stoi(lengthGemm) != std::stoi(ConvertDimShapeToLength(sC))) || + ( haveExtraC && std::stoi(lengthExtra_Y) != std::stoi(lengthExtra_C))) + throw std::runtime_error("SOFIE Gemm Op " + opName + " Bias tensor " + fNC + " has not correct size " + + ConvertShapeToString(fShapeC) + " output length " + lengthGemm); + } else { + // add a dynamic check (C should not be a dynamic tensor) + out << SP << "assert(" << lengthGemm << " == " << ConvertDimShapeToLength(sC) << ");\n"; + if (haveExtraC) out << SP << "assert(" << lengthExtra_Y << " == " << lengthExtra_C << ");\n"; + } + } + } else { + fBroadcastBias = false; + //in this case fAttrBeta needs to be equal to zero 
otherwise second time we run we will use + // the previous result + if (fAttrBeta != 0) { + // some model don't have bias but Beta is not zero - force it to zero + fAttrBeta = 0; + std::cout << "WARNING: SOFIE Gemm Op " + opName + " Bias tensor is not present but beta value in Gemm is not zero - force it to zero\n"; + } + } + + // include MatMul case where we stack the Gemm operations + // exclude case where we have only 1's in the additional dims + bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra_Y) > 1); + // compute input offset for stack multiplications + std::string lengthExtra_A; + std::string lengthExtra_B; + std::string increment_A; + std::string increment_B; + + if (doStackMul) { + std::vector sA(fShapeA.begin(), fShapeA.begin()+dimA-2); + std::vector sB(fShapeB.begin(), fShapeB.begin()+dimB-2); + std::vector mA = {fShapeA[dimA-2], fShapeA[dimA-1]}; + std::vector mB = {fShapeB[dimB-2], fShapeB[dimB-1]}; + lengthExtra_A = ConvertDimShapeToLength(sA); + lengthExtra_B = ConvertDimShapeToLength(sB); + // if A ( b, m, k) and B (b, k, n) these are the strides of A and B ( m*k for A and n*k for B ) + increment_A = ConvertDimShapeToLength(mA); + increment_B = ConvertDimShapeToLength(mB); + } + bool extraA = (doStackMul && lengthExtra_A != "1"); + bool extraB = (doStackMul && lengthExtra_B != "1"); + bool extraC = (doStackMul && haveExtraC && !fBroadcastBias); + // run time check for bias broadcasting + std::string biasShapeType = opName + "_biasShapeType"; + if (fBroadcastBias && fCheckBiasShapeAtRuntime) { + // create a flag according to bias shape: + // = 1 for (1,Y2) + // = 2 for (Y1,1) + // = 3 for a scalar + out << SP << "int " << biasShapeType << " = 0;\n"; + // case vector of columns + if (sC[0].GetVal() != "1" && sC[1].GetVal() != sY[1].GetVal()) + out << SP << "if (" << sC[0] << " == 1 && " << sC[1] << " == " << sY[1] << ")\n"; + else if (sC[0].GetVal() == "1") + out << SP << "if (" << sC[1] << " == " << sY[1] << ")\n"; + else if 
(sC[1].GetVal() == sY[1].GetVal()) + out << SP << "if (" << sC[0] << " == 1)\n"; + + out << SP << SP << biasShapeType << " = 1;\n"; + + // case vector of rows + if (sC[1].GetVal() != "1" && sC[0].GetVal() != sY[0].GetVal()) + out << SP << "else if (" << sC[1] << " == 1 && " << sC[0] << " == " << sY[0] << ")\n"; + else if (sC[1].GetVal() == "1") + out << SP << "else if (" << sC[0] << " == " << sY[0] << ")\n"; + else if (sC[0].GetVal() == sY[0].GetVal()) + out << SP << "else if (" << sC[1] << " == 1)\n"; + + out << SP << SP << biasShapeType << " = 2;\n"; + + // case scalar + if (sC[0].GetVal() != "1" && sC[1].GetVal() != "1") + out << SP << "else if (" << sC[0] << " == 1 && " << sC[1] << " == 1 )\n"; + else if (sC[0].GetVal() == "1") + out << SP << "else if (" << sC[1] << " == 1)\n"; + else if (sC[1].GetVal() == "1") + out << SP << "else if (" << sC[0] << " == 1)\n"; + out << SP << SP << biasShapeType << " = 3;\n"; + out << SP << "else\n"; + out << SP << SP << "throw std::runtime_error(\"SOFIE Gemm Op - bias tensor " + << ConvertDimShapeToString(fDimShapeC) << " cannot be broadcasted to " + << ConvertDimShapeToString(fShapeY) << "\");\n"; + } + auto SP2 = SP; + if (doStackMul) { + out << SP << "size_t " << opName << "_y_offset = 0;\n"; // needed if we stack the gemm operations + if (extraA) + out << SP << "size_t " << opName << "_A_offset = 0;\n"; + if (extraB) + out << SP << "size_t " << opName << "_B_offset = 0;\n"; + if (extraC) + out << SP << "size_t " << opName << "_C_offset = 0;\n"; + out << SP << "for (size_t i = 0; i < " << lengthExtra_Y << "; i++){\n"; + SP2 += SP; + } + // do the bias broadcasting at run time by + // initializing output Y vector with bias values + if (fBroadcastBias) { + + fAttrBeta = 1.; + + // loop on first output dimension + out << SP2 << "for (size_t j = 0; j < " << sY[0] << "; j++) { \n"; + out << SP2 << SP << "size_t y_index = "; + if (doStackMul) // add offset in case of stack multiplications (not sure if bias is present in these 
cases) + out << opName << "_y_offset + "; + if (sY[1].GetVal() != "1") + out << sY[1] << " * j;\n"; + else + out << "j;\n"; + + std::string prefix = SP2 + SP + "SOFIE::"; + std::string target = "tensor_" + fNY; + if (sC.size() != 2) { + throw std::runtime_error("SOFIE Gemm Op - invalid rank for bias tensor " + ConvertDimShapeToString(fDimShapeC) + ConvertDimShapeToString(sC)); + } if (sC[0].GetVal() == "1" && sC[1].GetVal() == sY[1].GetVal()) { + out << prefix << "Copy(" << target << " + y_index, tensor_" << fNC << ", " << sY[1] << ");\n"; + } else if (sC[1].GetVal() == "1" && sC[0].GetVal() == sY[0].GetVal()) { + out << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[j], " << sY[1] << ");\n"; + } else if (sC[0].GetVal() == "1" && sC[1].GetVal() == "1") { + // scalar case + out << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[0], " << sY[1] << ");\n"; + } else if (fCheckBiasShapeAtRuntime) { + // in the generic dynamic case we check at run time that bias is compatible + // we check that bias[0] = 1 or equal to SY[0] and that bias[1] = 1 or equal to SY[1] + // tbd: this run-time check coul;d be moved outside the loop for better run time efficiency + out << SP2 << SP << "if (" << biasShapeType << " == 1)\n"; // case vector of columns + out << SP << prefix << "Copy(" << target << " + y_index, tensor_" << fNC << ", " << sY[1] << ");\n"; + out << SP2 << SP << "else if (" << biasShapeType << " == 2)\n"; // case vector of rows + out << SP << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[j], " << sY[1] << ");\n"; + out << SP2 << SP << "else \n"; // scalar case + out << SP << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[0], " << sY[1] << ");\n"; + } else { + throw std::runtime_error("SOFIE Gemm Op - invalid shape for bias tensor " + ConvertDimShapeToString(fDimShapeC)); + } + + out << SP2 << "}\n"; + } + + if (fType == "float"){ + + out << SP2 << "SOFIE::Gemm_Call(" << "tensor_" << fNY; + if 
(doStackMul) out << " + " << opName << "_y_offset"; + out << ", " + << (fAttrTransB ? "true, " : "false, ") + << (fAttrTransA ? "true, " : "false, ") + << n << ", " << m << ", " << k << ", "; + out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ", tensor_" << fNB; + if (extraB) out << " + " << opName << "_B_offset"; + out << ", tensor_" << fNA; + if (extraA) out << " + " << opName << "_A_offset"; + out << ", " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; + // in the case of bias and no broadcasting needed - I need to add bias as an extra tensor in Gemm call + if (!fNC.empty() && !fBroadcastBias) { + out << "tensor_" << fNC; + if (extraC) { + out << " + " << opName << "_C_offset"; + } + } else { + out << "nullptr"; + } + out << ");\n"; + + } + + if (doStackMul) { + out << SP << SP << opName << "_y_offset += " << lengthGemm << ";\n"; + if (lengthExtra_A != "1") + out << SP << SP << opName << "_A_offset += " << increment_A << ";\n"; + if (lengthExtra_B != "1") + out << SP << SP << opName << "_B_offset += " << increment_B << ";\n"; + if (extraC) + // increment_C is lengthGEmm + out << SP << SP << opName << "_C_offset += " << lengthGemm << ";\n"; + out << SP << "}\n"; // end of loop on the stacked multiplication + } + + // fuse with Relu + if(fActivation == EActivationType::RELU){ + out << SP << "//--- applying RELU to output\n"; + std::string tnsr = "tensor_" + fNY; + std::string reluSize = ConvertDimShapeToLength(fShapeY); + out << SP << "SOFIE::Relu(" << tnsr << ", " << tnsr << ", " << reluSize << ");\n"; + } + + return out.str(); + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + + if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fDimShapeC.empty())) { + throw std::runtime_error("SOFIE Gemm Op called to Generate without being initialized first"); + } + std::stringstream out; + out << "\n//--------- Gemm_GPU_ALPAKA\n"; + out << SP << 
"alpaka::wait(queue);\n"; + out << SP << "char " << opName << "_transA = " << (fAttrTransA ? "\'t\'" : "\'n\'") << ";\n"; + out << SP << "char " << opName << "_transB = " << (fAttrTransB ? "\'t\'" : "\'n\'") << ";\n"; + // need to consider case A and B have dim > 2 (for MatMul) + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + int64_t dimY = fShapeY.size(); + if (dimA != dimB || dimA != dimY) { + throw std::runtime_error("SOFIE Gemm(MatMul) has invalid shape for inputs or output"); + } + auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; + // extra dimensions in case of stacked MatMul + std::vector sA; + for (int64_t i = 0; i < dimY-2; i++) { + sA.push_back(fShapeY[i]); + } + auto lengthGemm = ConvertDimShapeToLength(sY); // size of the Gemm operation + auto lengthExtra = ConvertDimShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul) + + out << SP << "int " << opName << "_m = " << m << ";\n"; + out << SP << "int " << opName << "_n = " << n << ";\n"; + out << SP << "int " << opName << "_k = " << k << ";\n"; + out << SP << "float " << opName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ";\n"; + + // restricting to a 0 beta since BIAS is configured separately through sofieBLAS interface + out << SP << "float " << opName << "_beta = 0;\n"; + + // case bias is present + if (!fNC.empty()){ + if (!fBroadcastBias) { + // add a check in case broadcasting was not needed or done outside of session + // C should have same size as Y + if (!fIsDynamic) { + if (std::stoi(lengthGemm) != static_cast(ConvertShapeToLength(fShapeC))) + throw std::runtime_error("SOFIE Gemm Op " + opName + " Bias tensor has not correct size " + + 
ConvertDimShapeToString(fDimShapeC) + " output length " + lengthGemm); + } else { + // add a dynamic check (C should equal output size) + out << SP << "assert(" << lengthGemm << " == " << ConvertDimShapeToLength(fDimShapeC) << ");\n"; + } + } + } else { + fBroadcastBias = false; + //in this case fAttrBeta needs to be equal to zero otherwise second time we run we will use + // the previous result + if (fAttrBeta != 0) { + // some model don't have bias but Beta is not zero - force it to zero + fAttrBeta = 0; + std::cout << "WARNING: SOFIE Gemm Op " + opName + " Bias tensor is not present but beta value in Gemm is not zero - force it to zero\n"; + } + } + + // include MatMul case where we stack the Gemm operations + // exclude case where we have only 1's in the additional dims + bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra) > 1); + + // Compute per-iteration strides for each buffer when stacking. + // m/n/k are std::string from Dim::GetVal(); stoi() is safe for static shapes. + size_t strideA = 0, strideB = 0, strideY = 0, strideC = 0; + if (doStackMul && !fIsDynamic) { + strideA = static_cast(std::stoi(m)) * static_cast(std::stoi(k)); + strideB = static_cast(std::stoi(n)) * static_cast(std::stoi(k)); + strideY = static_cast(std::stoi(m)) * static_cast(std::stoi(n)); + strideC = !fNC.empty() ? static_cast(std::stoi(lengthGemm)) : 0; + out << SP << "size_t " << opName << "_yoffset = 0;\n"; + out << SP << "for (int i = 0; i < " << lengthExtra << "; i++){\n"; + } else if (doStackMul) { + // Dynamic case: emit symbolic stride expressions + out << SP << "size_t " << opName << "_yoffset = 0;\n"; + out << SP << "for (int i = 0; i < " << lengthExtra << "; i++){\n"; + } + + // in the case of bias + if (!fNC.empty()){ + // Use getPtrNative() for all args so the raw-pointer overload is selected regardless + // of whether each buffer is a BufXxx (member weight/bias/output) or ViewPlainPtr (input view). 
+ std::string pA = "alpaka::getPtrNative(deviceBuf_" + fNA + ")"; + std::string pB = "alpaka::getPtrNative(deviceBuf_" + fNB + ")"; + std::string pC = "alpaka::getPtrNative(deviceBuf_" + fNC + ")"; + std::string pY = "alpaka::getPtrNative(deviceBuf_" + fNY + ")"; + if (doStackMul && !fIsDynamic) { + pA += " + i * " + std::to_string(strideA); + pB += " + i * " + std::to_string(strideB); + pY += " + i * " + std::to_string(strideY); + if (!fBroadcastBias && strideC > 0) pC += " + i * " + std::to_string(strideC); + } + if (fActivation == EActivationType::RELU){ + out << SP << "blas.gemmrelu("< GetBlasRoutines() override { return { std::string("Gemm"), std::string("Gemv") }; } + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNY); + fNY = fusable_tensor_name; + fOutputTensorNames[0] = fNY; + } + + std::string GetBlasConfig(){ + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + auto lda = (fAttrTransA ? m : k); + auto ldb = (fAttrTransB ? k : n); + auto ldc = n; + return n+", "+m+", "+k+", "+ldb+", "+lda+", "+ldc+", "+(fAttrTransB ? "'t'" : "'n'")+", "+(fAttrTransA ? 
"'t'" : "'n'"); + } + }; + + +}//SOFIE + +#endif //SOFIE_ROPERATOR_GEMM diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Identity.hxx b/core/inc/SOFIE/ROperator_Identity.hxx similarity index 93% rename from src/SOFIE_core/inc/SOFIE/ROperator_Identity.hxx rename to core/inc/SOFIE/ROperator_Identity.hxx index efb6b14..d68b00c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Identity.hxx +++ b/core/inc/SOFIE/ROperator_Identity.hxx @@ -41,7 +41,7 @@ public: void Initialize(RModel& model) override { //input must be a graph input, or already initialized intermediate tensor if (model.CheckIfTensorAlreadyExist(fNX) == false){ - throw std::runtime_error("TMVA SOFIE Identity Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE Identity Op Input Tensor is not found in model"); } fShape = model.GetTensorShape(fNX); if (model.IsInitializedTensor(fNX)) { @@ -77,7 +77,7 @@ public: if (fIsOutputConstant || fIsInputInitialized) return ""; OpName = "op_" + OpName; if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Operator Identity called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator Identity called to Generate without being initialized first"); } std::stringstream out; out << "\n//------ IDENTITY\n"; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.hxx b/core/inc/SOFIE/ROperator_LSTM.hxx similarity index 98% rename from src/SOFIE_core/inc/SOFIE/ROperator_LSTM.hxx rename to core/inc/SOFIE/ROperator_LSTM.hxx index 5bfd4e3..69fb7a2 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.hxx +++ b/core/inc/SOFIE/ROperator_LSTM.hxx @@ -106,7 +106,7 @@ template class ROperator_LSTM final : public ROperator { fType = "float"; } else { throw std::runtime_error( - "TMVA SOFIE Encountered unsupported type parsing a LSTM operator"); + "SOFIE Encountered unsupported type parsing a LSTM operator"); } fInputTensorNames = { fNX, fNW, fNR }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc 
b/core/inc/SOFIE/ROperator_LSTM.icc similarity index 97% rename from src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc rename to core/inc/SOFIE/ROperator_LSTM.icc index bec7760..2fb390d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc +++ b/core/inc/SOFIE/ROperator_LSTM.icc @@ -1,7 +1,6 @@ #ifndef SOFIE_ROPERATOR_LSTM_I #define SOFIE_ROPERATOR_LSTM_I - namespace SOFIE { template @@ -41,33 +40,33 @@ auto ROperator_LSTM::Initialize(RModel& model) fUseSession = model.UseSession(); // Check the input and output tensors if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNX + " is not found in model."); + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNX + " is not found in model."); } fShapeX = model.GetTensorShape(fNX); if (fShapeX.size() != 3) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNX + " is not of 3 dimensions."); + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNX + " is not of 3 dimensions."); } if (!model.CheckIfTensorAlreadyExist(fNW)) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNW + " is not found in model."); + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNW + " is not found in model."); } fShapeW = model.GetTensorShape(fNW); if (fShapeW.size() != 3) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNW + " is not of 3 dimensions."); + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNW + " is not of 3 dimensions."); } if (!model.CheckIfTensorAlreadyExist(fNR)) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNR + " is not found in model."); + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNR + " is not found in model."); } fShapeR = model.GetTensorShape(fNR); if (fShapeR.size() != 3) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNR + " is not of 3 dimensions."); + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNR + " is not of 3 
dimensions."); } if (!fNB.empty()) { if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw std::runtime_error("TMVA SOFIE LSTM op input tensor " + fNB + " is not found in model."); + throw std::runtime_error("SOFIE LSTM op input tensor " + fNB + " is not found in model."); } fShapeB = model.GetTensorShape(fNB); if (fShapeB.size() != 2 && fShapeB.size() != 5) { - throw std::runtime_error("TMVA SOFIE LSTM op input tensor " + fNB + " is not of 2 or 5 dimensions."); + throw std::runtime_error("SOFIE LSTM op input tensor " + fNB + " is not of 2 or 5 dimensions."); } if (fShapeB.size() == 2) { // Broadcasting the bias @@ -104,46 +103,46 @@ auto ROperator_LSTM::Initialize(RModel& model) } if (!fNSequence_lens.empty()) { if (!model.CheckIfTensorAlreadyExist(fNSequence_lens)) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNSequence_lens + "is not found in model."); } fShapeSequence_lens = model.GetTensorShape(fNSequence_lens); if (fShapeSequence_lens.size() != 1) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNSequence_lens + " is not of 1 dimension."); } } if (!fNInitial_h.empty()) { if (!model.CheckIfTensorAlreadyExist(fNInitial_h)) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNInitial_h + " is not found in model."); } fShapeInitial_h = model.GetTensorShape(fNInitial_h); if (fShapeInitial_h.size() != 3) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNInitial_h + " is not of 3 dimensions."); } } if (!fNInitial_c.empty()) { if (!model.CheckIfTensorAlreadyExist(fNInitial_c)) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNInitial_c + " is not found in model."); } fShapeInitial_c = 
model.GetTensorShape(fNInitial_c); if (fShapeInitial_c.size() != 3) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNInitial_c + " is not of 3 dimensions."); } } if (!fNP.empty()) { if (!model.CheckIfTensorAlreadyExist(fNP)) { - throw std::runtime_error("TMVA SOFIE LSTM op input tensor " + fNP + " is not found in model."); + throw std::runtime_error("SOFIE LSTM op input tensor " + fNP + " is not found in model."); } fShapeP = model.GetTensorShape(fNP); if (fShapeP.size() != 2 && fShapeP.size() != 4) { - throw std::runtime_error("TMVA SOFIE LSTM op input tensor " + fNP + " is not of 2 or 4 dimensions."); + throw std::runtime_error("SOFIE LSTM op input tensor " + fNP + " is not of 2 or 4 dimensions."); } if (fShapeP.size() == 2) { // Broadcasting the weight for peepholes @@ -197,28 +196,28 @@ auto ROperator_LSTM::Initialize(RModel& model) activation != "ScaledTanh" && activation != "HardSigmoid" && activation != "Elu" && activation != "Softsign" && activation != "Softplus") { - throw std::runtime_error("TMVA SOFIE - Activation function " + + throw std::runtime_error("SOFIE - Activation function " + activation + " not implemented"); } } if (fAttrDirection != "forward" && fAttrDirection != "backward" && fAttrDirection != "bidirectional") { throw std::runtime_error( - "TMVA SOFIE - Invalid LSTM direction fAttrDirection = " + + "SOFIE - Invalid LSTM direction fAttrDirection = " + fAttrDirection); } if (4 * fAttrHiddenSize != fShapeW[1]) { throw std::runtime_error( - "TMVA SOFIE - fAttrHiddenSize must be equal to " + + "SOFIE - fAttrHiddenSize must be equal to " + std::to_string(fShapeW[1] / 4)); } if (fAttrInputForget > 1) { throw std::runtime_error( - "TMVA SOFIE - fAttrInputForget = " + std::to_string(fAttrInputForget) + "SOFIE - fAttrInputForget = " + std::to_string(fAttrInputForget) + " must be 0 or 1."); } if (fAttrLayout > 1) { - throw std::runtime_error("TMVA SOFIE - Layout fAttrLayout = 
" + + throw std::runtime_error("SOFIE - Layout fAttrLayout = " + std::to_string(fAttrLayout) + " must be 0 (timewise) or 1 (batchwise)"); } @@ -291,7 +290,7 @@ auto ROperator_LSTM::Generate(std::string OpName) // set the input if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << fType << " const *" << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n"; diff --git a/core/inc/SOFIE/ROperator_LayerNormalization.hxx b/core/inc/SOFIE/ROperator_LayerNormalization.hxx new file mode 100644 index 0000000..b2f4d90 --- /dev/null +++ b/core/inc/SOFIE/ROperator_LayerNormalization.hxx @@ -0,0 +1,565 @@ +#ifndef SOFIE_ROPERATOR_LAYERNORMALIZATION +#define SOFIE_ROPERATOR_LAYERNORMALIZATION + +#include "SOFIE/RModel.hxx" +#include "SOFIE/SOFIE_common.hxx" +#include +#include + +namespace SOFIE { + +template +class ROperator_LayerNormalization : public ROperator { +private: + bool fCastToFloat = false; // flag to indicate if operation 1 are in floats (to be impl) + int fAttrAxis; + float fAttrEpsilon; + size_t fAttrStashType; + + std::string fNX; + std::string fNScale; + std::string fNB; + std::string fNY; + std::string fNMean; + std::string fNInvStdDev; + + std::string fNCastedX; + std::string fNNormalizedX; + std::string fNBroadcastedB; + + std::vector fShapeX; + std::vector fShapeScale; + std::vector fShapeB; + std::vector fShapeY; + std::vector fShapeMean; + std::vector fShapeInvStdDev; + + size_t fAxis; // axis in [0, size) + size_t fSize; // Size of the input + // size_t fAxisDim; + + std::vector fNormalizedShape; // shape from X[ axis,...,N-1] + std::vector fAxesShape; // shape from X[0,..,axis-1] + // lengths in string format + std::string fLength; // Length of the input + std::string fNormalizedLength; + std::string fAxesLength; + + std::string fType; + +public: + ROperator_LayerNormalization() {} + + 
ROperator_LayerNormalization(int axis, float epsilon, size_t stashType, const std::string &nameX, + const std::string &nameScale, const std::string &nameB, const std::string &nameY, + const std::string &nameMean, const std::string &nameInvStdDev) + : fAttrAxis(axis), fAttrEpsilon(epsilon), fAttrStashType(stashType), fNX(UTILITY::Clean_name(nameX)), + fNScale(UTILITY::Clean_name(nameScale)), fNB(UTILITY::Clean_name(nameB)), + fNY(UTILITY::Clean_name(nameY)), fNMean(UTILITY::Clean_name(nameMean)), fNInvStdDev(UTILITY::Clean_name(nameInvStdDev)) + { + fInputTensorNames = { fNX, fNScale }; + if (!fNB.empty()){ + fInputTensorNames.emplace_back(fNB); + } + + fOutputTensorNames = { fNY }; + if (!fNMean.empty()){ + fOutputTensorNames.emplace_back(fNMean); + } + if (!fNInvStdDev.empty()){ + fOutputTensorNames.emplace_back(fNInvStdDev); + } + } + + std::vector> ShapeInference(std::vector> input) override { return input; } + + std::vector TypeInference(std::vector input) override { return input; } + + void Initialize(RModel& model) override { + if (!model.CheckIfTensorAlreadyExist(fNX)) { + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Tensor " + fNX + " not found."); + } + bool isDynamic = model.IsDynamicTensor(fNX); + fShapeX = model.GetDimTensorShape(fNX); + fShapeY = fShapeX; + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + // Type of the output + fType = ConvertTypeToString(model.GetTensorType(fNX)); + // Size of the input + fSize = fShapeX.size(); + // Axis in [0, size) + fAxis = (fAttrAxis < 0) ? 
fSize + fAttrAxis : fAttrAxis; + // Shape of fShapeX[0, ..., fAxis) + fAxesShape = std::vector(fShapeX.begin(), fShapeX.begin() + fAxis); + // Length of the axes + fAxesLength = ConvertDimShapeToLength(fAxesShape); + // Shape of fShapeX[fAxis, ..., fSize) + fNormalizedShape = std::vector(fShapeX.begin() + fAxis, fShapeX.end()); + // Length of the normalized axis + fNormalizedLength = ConvertDimShapeToLength(fNormalizedShape); + // length of the input + fLength = ConvertDimShapeToLength(fShapeX); + // Type of mean and std + ETensorType type = (fAttrStashType == 1) ? ETensorType::FLOAT : model.GetTensorType(fNX); + // Mean + if (!fNMean.empty()) { + // cannot use initializer list with one element since it is ambiguous + if (isDynamic) + // add size_t(-1) to indicate that shape is an expression + model.AddIntermediateTensor(fNMean, type, std::vector(1,Dim{fAxesLength,std::size_t(-1)})); + else + model.AddIntermediateTensor(fNMean, type, std::vector(1,std::stoi(fAxesLength))); + } + // Inverse Standard Deviation + if (!fNInvStdDev.empty()) { + if (isDynamic) + model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,Dim{fAxesLength,std::size_t(-1)})); + else + model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,std::stoi(fAxesLength))); + } + // if mean and stdev are not empty they are not defined in the output list + // Cast X to float + if (fAttrStashType == 1 && model.GetTensorType(fNX) != ETensorType::FLOAT) { + fCastToFloat = true; + fType = "float"; + // fNCastedX = "Casted" + fNX; + // model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); + // fNNormalizedX = "Normalized" + fNX; + // model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); + } + // scale shape + fShapeScale = model.GetDimTensorShape(fNScale); + // appends 1 to scale shapes if missing + size_t dimScale = fShapeScale.size(); + if (dimScale < fSize) { + for (size_t i = 0; i < fSize-dimScale; i++) + fShapeScale.insert(fShapeScale.begin(), Dim{1}); + } + // 
check also shape if consistent now + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1 && fShapeScale[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Scale Tensor has invalid shape " + ConvertDimShapeToString(fShapeScale)); + } + if (!fNB.empty()) { + fShapeB = model.GetDimTensorShape(fNB); + // appends 1 to bias shapes if missing + size_t dimB = fShapeB.size(); + if (dimB < fShapeX.size()) { + for (size_t i = 0; i < fSize-dimB; i++) + fShapeB.insert(fShapeB.begin(), Dim{1}); + } + for (size_t i = 0; i < fSize; i++) { + if (fShapeB[i].dim != 1 && fShapeB[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Bias Tensor has invalid shape " + ConvertDimShapeToString(fShapeScale)); + } + } + + std::cout << "bias + scale " << ConvertDimShapeToString(fShapeB) << " " << ConvertDimShapeToString(fShapeScale) << std::endl; + + // // Broadcast the bias + // if (!fNB.empty()) { + // fShapeB = model.GetTensorShape(fNB); + // size_t lengthB = ConvertShapeToLength(fShapeB); + // if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { + // fNBroadcastedB = "Broadcasted" + fNB; + // model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); + // } + // } + model.AddNeededStdLib("cmath"); + } + + std::string GenerateInitCode() override + { + std::stringstream out; + if (!fNBroadcastedB.empty()) { + out << SP << "// Broadcasting the bias of LayerNormalization op\n"; + out << SP << "{\n"; + out << SP << SP << "float* data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_"; + out << fNB << ", " << ConvertDimShapeToString(fShapeB) << ", " << ConvertDimShapeToString(fShapeX) << ");\n"; + out << SP << "std::copy(data, data + " << fLength << ", tensor_" << fNBroadcastedB << ");\n"; + out << SP << "delete[] data;\n"; + out << SP << "}\n"; + } + return out.str(); + } + + std::string Generate(std::string opName) override + { + opName = "op_" + opName; + if (fShapeX.empty()) { + throw 
std::runtime_error("TMVA::SOFIE LayerNormalization operator " + opName + + " called to generate without being initialized first."); + } + + std::stringstream out; + + out << "//---- Layer Normalization operator " << opName << "\n"; + + // Loop over all the normalized axes i.e. [axis, ..., size) + std::vector inputShape(fSize); + + for (size_t i = 0; i < fSize; i++) { + inputShape[i] = fShapeX[i].GetVal(); + } + + auto strides = UTILITY::ComputeStrideFromShape(fShapeX); + std::string inputIndex = "axis_0 * " + strides[0].GetVal(); + for (size_t i = 1; i < fSize; i++) { + inputIndex += " + axis_" + std::to_string(i); + if (i < fSize-1) inputIndex += " * " + strides[i].GetVal(); + } + auto scaleStrides = UTILITY::ComputeStrideFromShape(fShapeScale); + std::string scaleIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1) { + if (!scaleIndex.empty()) scaleIndex += " + "; + scaleIndex += "axis_" + std::to_string(i); + if ( scaleStrides[i].dim != 1) scaleIndex += " * " + scaleStrides[i].GetVal(); + } + } + if (scaleIndex.empty()) scaleIndex = "0"; + + auto biasStrides = UTILITY::ComputeStrideFromShape(fShapeB); + std::string biasIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeB[i].dim != 1) { + if (!biasIndex.empty()) biasIndex += " + "; + biasIndex += "axis_" + std::to_string(i); + if ( biasStrides[i].dim != 1) biasIndex += " * " + biasStrides[i].GetVal(); + } + } + if (biasIndex.empty()) biasIndex = "0"; + + auto axesStrides = UTILITY::ComputeStrideFromShape(fAxesShape); + std::string axesIndex = "axis_" + std::to_string(0) + " * " + axesStrides[0].GetVal(); + for (size_t i = 1; i < fAxis; i++) { + axesIndex += " + axis_" + std::to_string(i) + " * " + axesStrides[i].GetVal(); + } + + + // compute mean and std-dev. 
Save in tensors if requested + + out << SP << "// Compute the mean\n"; + + // Loop over all the outer dims in [0, fAxis) + for (size_t i = 0; i < fAxis; i++) { + std::string iIdx = "axis_" + std::to_string(i); + out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] + << "; " << iIdx << "++) {\n"; + } + out << SP << SP << fType << " mean = 0.;\n"; + // loop over the normalized dimensions (fAxis,....,N-1) + for (size_t j = fAxis; j < fSize; j++) { + std::string jIdx = "axis_" + std::to_string(j); + out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] + << "; " << jIdx << "++) {\n"; + } + out << SP << SP << SP << "mean += tensor_" << fNX << "[" << inputIndex << "];\n"; + for (size_t j = fAxis; j < fSize; j++) { + out << SP << SP << "}\n"; + } + out << SP << SP << "mean /= " << fType << "(" << fNormalizedLength << ");\n"; + + + out << SP << "// Compute the inverse Standard Deviation\n"; + + // Set sum = 0 + out << SP << SP << fType << " sum = 0.;\n"; + // loop over all the dims in [0, fAxis) + for (size_t j = fAxis; j < fSize; j++) { + std::string jIdx = "axis_" + std::to_string(j); + out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] + << "; " << jIdx << "++){\n"; + } + out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << inputIndex << "] - mean;\n"; + out << SP << SP << SP << "sum += tmp*tmp;\n"; + for (size_t j = fAxis; j < fSize; j++) { + out << SP << SP << "}\n"; + } + out << SP << SP << fType << " invStdDev = 1 / std::sqrt("; + out << "sum / " << fType << "(" << fNormalizedLength << ") + " << fAttrEpsilon << ");\n"; + + + // set output mean and invStdDev if requested + if (!fNMean.empty()) + out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = mean;\n"; + if (!fNInvStdDev.empty()) + out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = invStdDev;\n"; + + // scale and add bias + + out << SP << "// Y = Scale o InvStdDev (X - 
Mean)\n"; + + for (size_t j = fAxis; j < fSize; j++) { + std::string jIdx = "axis_" + std::to_string(j); + out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx + << "++){\n"; + } + out << SP << SP << SP << "tensor_" << fNY << "[" << inputIndex << "] = tensor_" << fNScale; + out << "[" << scaleIndex << "] * invStdDev * (tensor_" << fNX << "[" << inputIndex << "] - mean)"; + + // add bias if needed + if (!fNB.empty()) + // assume bias has index as scale + out << " + tensor_" << fNB << "[" << biasIndex << "]"; + out << ";\n"; + + // close loops on normalizing dim [..,fAxis,...fSize-1] + for (size_t j = fAxis; j < fSize; j++) { + out << SP << SP << "}\n"; + } + // close loops on the other dimensions [0,...,fAxis] + for (size_t i = 0; i < fAxis; i++) { + out << SP << "}\n"; + } + + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty()) + throw std::runtime_error("TMVA::SOFIE LayerNormalization called to Generate without being initialized first"); + + // Each thread handles one "row" — one element of the axes dims [0..axis) + // and iterates over all normalized dims [axis..size) + // axesLength = product of fShapeX[0..axis) + // normalizedLength = product of fShapeX[axis..size) + // totalElements = axesLength (one thread per row) + + std::vector inputShape(fSize); + for (size_t i = 0; i < fSize; i++) + inputShape[i] = fShapeX[i].GetVal(); + + auto strides = UTILITY::ComputeStrideFromShape(fShapeX); + auto scaleStrides = UTILITY::ComputeStrideFromShape(fShapeScale); + auto biasStrides = (!fNB.empty()) ? UTILITY::ComputeStrideFromShape(fShapeB) + : std::vector{}; + auto axesStrides = UTILITY::ComputeStrideFromShape(fAxesShape); + + // Build index expressions reusing the same logic as Generate() + // input index: axis_0*stride0 + axis_1*stride1 + ... + norm_0*stride_axis + ... 
+ // For the kernel we decompose the linear thread index into axis coords, + // then loop over normalized dims inside the kernel. + + std::string kname = "LayerNormKernel_" + opName; + std::string op; + op = "\n//------ LAYERNORM_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ X,\n"; + op += SP + SP + SP + "T const* __restrict__ scale,\n"; + if (!fNB.empty()) + op += SP + SP + SP + "T const* __restrict__ bias,\n"; + if (!fNMean.empty()) + op += SP + SP + SP + "T* __restrict__ out_mean,\n"; + if (!fNInvStdDev.empty()) + op += SP + SP + SP + "T* __restrict__ out_invstd,\n"; + op += SP + SP + SP + "T* __restrict__ Y,\n"; + op += SP + SP + SP + "std::size_t const axesLength) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= axesLength) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t row = global_thread_idx; row < axesLength; row += grid_thread_extent) {\n\n"; + + // Decompose row into per-axes-dim coords using compile-time strides + if (fAxis > 0) { + for (size_t i = 0; i < fAxis; ++i) { + op += SP + SP + SP + SP + "std::size_t const axis_" + std::to_string(i) + + " = (row / " + axesStrides[i].GetVal() + "u) % " + + inputShape[i] + "u;\n"; + } + op += "\n"; + } + + // Base input offset for this row (contribution from axes dims only) + op += SP + SP + SP + SP + "std::size_t const row_base =\n"; + if (fAxis == 0) { + op += SP + SP + SP + SP + SP + "0u;\n\n"; + } else { + for (size_t i = 0; i < fAxis; ++i) { + op += SP + SP + SP + SP + SP + "axis_" + std::to_string(i) + + " * " + strides[i].GetVal() + "u"; + op += (i + 1 < fAxis) ? 
" +\n" : ";\n\n"; + } + } + + // Scale index base (from axes dims) + op += SP + SP + SP + SP + "std::size_t const scale_base =\n"; + { + bool any = false; + for (size_t i = 0; i < fAxis; ++i) { + if (fShapeScale[i].dim != 1) { + op += SP + SP + SP + SP + SP + "axis_" + std::to_string(i) + + " * " + scaleStrides[i].GetVal() + "u"; + if (any) op = " +\n" + op; + any = true; + } + } + if (!any) op += SP + SP + SP + SP + SP + "0u"; + op += ";\n\n"; + } + + if (!fNB.empty()) { + op += SP + SP + SP + SP + "std::size_t const bias_base =\n"; + bool any = false; + for (size_t i = 0; i < fAxis; ++i) { + if (fShapeB[i].dim != 1) { + op += SP + SP + SP + SP + SP + "axis_" + std::to_string(i) + + " * " + biasStrides[i].GetVal() + "u"; + if (any) op = " +\n" + op; + any = true; + } + } + if (!any) op += SP + SP + SP + SP + SP + "0u"; + op += ";\n\n"; + } + + // ---- Pass 1: compute mean ---- + op += SP + SP + SP + SP + "T mean = static_cast(0);\n"; + + // Unroll normalized dims loop for mean + for (size_t j = fAxis; j < fSize; ++j) + op += SP + SP + SP + SP + "for (std::size_t n_" + std::to_string(j) + + " = 0; n_" + std::to_string(j) + " < " + inputShape[j] + + "u; n_" + std::to_string(j) + "++) {\n"; + + // Normalized dim index + op += SP + SP + SP + SP + SP + "std::size_t const norm_idx = row_base"; + for (size_t j = fAxis; j < fSize; ++j) + op += " + n_" + std::to_string(j) + " * " + strides[j].GetVal() + "u"; + op += ";\n"; + op += SP + SP + SP + SP + SP + "mean += X[norm_idx];\n"; + + for (size_t j = fAxis; j < fSize; ++j) + op += SP + SP + SP + SP + "}\n"; + + op += SP + SP + SP + SP + "mean /= static_cast(" + fNormalizedLength + ");\n\n"; + + // ---- Pass 2: compute variance ---- + op += SP + SP + SP + SP + "T sum = static_cast(0);\n"; + + for (size_t j = fAxis; j < fSize; ++j) + op += SP + SP + SP + SP + "for (std::size_t n_" + std::to_string(j) + + " = 0; n_" + std::to_string(j) + " < " + inputShape[j] + + "u; n_" + std::to_string(j) + "++) {\n"; + + op += SP + SP + SP 
+ SP + SP + "std::size_t const norm_idx = row_base"; + for (size_t j = fAxis; j < fSize; ++j) + op += " + n_" + std::to_string(j) + " * " + strides[j].GetVal() + "u"; + op += ";\n"; + op += SP + SP + SP + SP + SP + "T tmp = X[norm_idx] - mean;\n"; + op += SP + SP + SP + SP + SP + "sum += tmp * tmp;\n"; + + for (size_t j = fAxis; j < fSize; ++j) + op += SP + SP + SP + SP + "}\n"; + + op += SP + SP + SP + SP + "T const invStdDev = static_cast(1) / " + "alpaka::math::sqrt(acc, sum / static_cast(" + + fNormalizedLength + ") + static_cast(" + std::to_string(fAttrEpsilon) + "));\n\n"; + + // Save mean and invstd if requested + if (!fNMean.empty()) + op += SP + SP + SP + SP + "out_mean[row] = mean;\n"; + if (!fNInvStdDev.empty()) + op += SP + SP + SP + SP + "out_invstd[row] = invStdDev;\n"; + op += "\n"; + + // ---- Pass 3: normalize, scale, bias ---- + for (size_t j = fAxis; j < fSize; ++j) + op += SP + SP + SP + SP + "for (std::size_t n_" + std::to_string(j) + + " = 0; n_" + std::to_string(j) + " < " + inputShape[j] + + "u; n_" + std::to_string(j) + "++) {\n"; + + op += SP + SP + SP + SP + SP + "std::size_t const norm_idx = row_base"; + for (size_t j = fAxis; j < fSize; ++j) + op += " + n_" + std::to_string(j) + " * " + strides[j].GetVal() + "u"; + op += ";\n"; + + // Scale index (normalized dims contribution) + op += SP + SP + SP + SP + SP + "std::size_t const s_idx = scale_base"; + for (size_t j = fAxis; j < fSize; ++j) { + if (fShapeScale[j].dim != 1) + op += " + n_" + std::to_string(j) + " * " + scaleStrides[j].GetVal() + "u"; + } + op += ";\n"; + + op += SP + SP + SP + SP + SP + "T val = scale[s_idx] * invStdDev * (X[norm_idx] - mean);\n"; + + if (!fNB.empty()) { + op += SP + SP + SP + SP + SP + "std::size_t const b_idx = bias_base"; + for (size_t j = fAxis; j < fSize; ++j) { + if (fShapeB[j].dim != 1) + op += " + n_" + std::to_string(j) + " * " + biasStrides[j].GetVal() + "u"; + } + op += ";\n"; + op += SP + SP + SP + SP + SP + "val += bias[b_idx];\n"; + } + + op 
+= SP + SP + SP + SP + SP + "Y[norm_idx] = val;\n"; + + for (size_t j = fAxis; j < fSize; ++j) + op += SP + SP + SP + SP + "}\n"; + + op += SP + SP + SP + "}\n"; // end row loop + op += SP + SP + "}\n"; // end operator() + op += SP + "};\n"; // end struct + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string kname = "LayerNormKernel_" + opName; + return SP + kname + " layerNormKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty()) + throw std::runtime_error("TMVA::SOFIE LayerNormalization called to Generate without being initialized first"); + + // One thread per row (per axes element) + // axesLength is known at generation time for static shapes + std::string axesLengthStr = fAxesLength; + // For static models fAxesLength is a number string; use it directly + // For dynamic models it may be a param expression — still valid in generated code + + std::string kname = "layerNormKernel_" + opName; + + std::stringstream out; + out << "\n//------ LAYERNORM_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << axesLengthStr << "});\n"; + // Build argument list + std::string args = + "alpaka::getPtrNative(deviceBuf_" + fNX + "), " + + "alpaka::getPtrNative(deviceBuf_" + fNScale + ")"; + if (!fNB.empty()) + args += ", alpaka::getPtrNative(deviceBuf_" + fNB + ")"; + if (!fNMean.empty()) + args += ", alpaka::getPtrNative(deviceBuf_" + fNMean + ")"; + if (!fNInvStdDev.empty()) + args += ", alpaka::getPtrNative(deviceBuf_" + fNInvStdDev + ")"; + args += ", alpaka::getPtrNative(deviceBuf_" + fNY + ")"; + args += ", static_cast(" + axesLengthStr + ")"; + + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out 
<< SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname << ", " << args << ");\n"; + + return out.str(); + } + + std::vector GetBlasRoutines() override { return { std::string("Axpy") }; } + + std::vector GetStdLibs() override { return { std::string("cmath") }; } +}; + +} // namespace SOFIE + +#endif diff --git a/core/inc/SOFIE/ROperator_LeakyRelu.hxx b/core/inc/SOFIE/ROperator_LeakyRelu.hxx new file mode 100644 index 0000000..81fdb09 --- /dev/null +++ b/core/inc/SOFIE/ROperator_LeakyRelu.hxx @@ -0,0 +1,141 @@ +#ifndef SOFIE_ROPERATOR_LeakyRelu +#define SOFIE_ROPERATOR_LeakyRelu + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + + +namespace SOFIE{ + +template +class ROperator_LeakyRelu final : public ROperator +{ + +private: + + /* Attributes*/ + float falpha=0.01; //default value + std::string fNX; + std::string fNY; + std::vector fShape; + std::string fType; + +public: + ROperator_LeakyRelu(){} + ROperator_LeakyRelu(float alpha,std::string nameX, std::string nameY): + falpha(alpha),fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) + { + fKind = OperatorKind::LEAKYRELU; + if(std::is_same::value){ + fType = "float"; + } + else{ + throw + std::runtime_error("SOFIE Encountered unsupported type parsing a Leaky Relu operator"); + } + + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; //suggest copy to compiler + return ret; + } + + void Initialize(RModel& model) override { + if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("SOFIE Leaky Relu Op Input Tensor is not found in model"); + } + fShape = model.GetTensorShape(fNX); + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); + } + + + 
std::string Generate(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Operator Leaky Relu called to Generate without being initialized first"); + } + std::stringstream out; + size_t length = ConvertShapeToLength(fShape); + + out << SP << "constexpr float " << OpName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << falpha << ";\n"; + + out << "\n//------ LEAKY RELU\n"; + out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; + out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] >= 0 )? tensor_" << fNX << "[id] : "<< OpName << "_alpha * tensor_"<< fNX<<"[id]);\n"; + out << SP << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = "\n//------ LEAKY_RELU_KERNEL_ALPAKA\n"; + op += "struct LeakyReluKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements, T alpha) const {\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < numElements) {\n"; + op += SP + SP + SP + "out[idx] = data[idx] >= T(0) ? 
data[idx] : alpha * data[idx];\n"; + op += SP + SP + "}\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return "LeakyReluKernel leakyReluKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Operator LeakyRelu called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ LEAKY_RELU_GPU_ALPAKA\n"; + out << SP << "constexpr float " << OpName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << falpha << ";\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNX + << ", leakyReluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "), " << OpName << "_alpha);\n"; + out << SP <<"alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + bool IsElementwise() const override { return true; } + std::string GetElementwiseExpr(const std::string& v) const override { + return "((" + v + " >= 0) ? 
" + v + " : " + std::to_string(falpha) + " * " + v + ")"; + } + + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } + +}; + +}//SOFIE + +#endif //SOFIE_ROPERATOR_LeakyRelu diff --git a/core/inc/SOFIE/ROperator_Not.hxx b/core/inc/SOFIE/ROperator_Not.hxx new file mode 100644 index 0000000..c04ab1f --- /dev/null +++ b/core/inc/SOFIE/ROperator_Not.hxx @@ -0,0 +1,111 @@ +#ifndef TMVA_EXPERIMENTAL_SOFIE_ROPERATOR_NOT +#define TMVA_EXPERIMENTAL_SOFIE_ROPERATOR_NOT + +#include +#include +#include + + +namespace SOFIE { + + +class ROperator_Not final : public ROperator { +private: + std::string fNX; + std::string fNY; + + std::vector fShapeX; + std::vector fShapeY; + +public: + ROperator_Not() {} + + ROperator_Not(std::string nameX, std::string nameY) + : fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + + void Initialize(RModel& model) override { + if (!model.CheckIfTensorAlreadyExist(fNX)) { + throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); + } + fShapeX = model.GetDimTensorShape(fNX); + fShapeY = fShapeX; + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + } + + std::string Generate(std::string opName) override + { + opName = "op_" + opName; + std::stringstream out; + + out << SP << "\n//---- Operator Not " << opName << "\n"; + auto length = ConvertDimShapeToLength(fShapeX); + out << SP << "for (size_t i = 0; i < " << length << "; i++) {\n"; + out << SP << SP << "tensor_" << fNY << "[i] = !tensor_" + fNX + "[i];\n"; + out << SP << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override + { + if 
(fIsOutputConstant) + return ""; + + std::string op; + op = "\n//------ NOT_KERNEL_ALPAKA\n"; + op += SP + "struct NotKernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const & acc,\n"; + op += SP + SP + SP + "T const * data,\n"; + op += SP + SP + SP + "T * output,\n"; + op += SP + SP + SP + "std::size_t const length) const\n"; + op += SP + SP + "{\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < length) {\n"; + op += SP + SP + SP + SP + "output[idx] = !data[idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override + { + return SP + "NotKernel notKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override + { + opName = "op_" + opName; + std::stringstream out; + auto length = ConvertDimShapeToLength(fShapeX); + + out << "\n//------ " << opName << "_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << length << "});\n"; + out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; + out << SP << "auto task_" << opName + << " = alpaka::createTaskKernel(workDiv_" << fNY + << ", " << "notKernel" + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", " << length << ");\n"; + out << SP << "alpaka::enqueue(queue, task_" << opName << ");\n"; + return out.str(); + } + + bool IsElementwise() const override { return !fIsOutputConstant; } + std::string GetElementwiseExpr(const std::string& v) const override { + return "!" 
+ v; + } + +}; + +} // namespace SOFIE + +#endif diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Pad.hxx b/core/inc/SOFIE/ROperator_Pad.hxx similarity index 89% rename from src/SOFIE_core/inc/SOFIE/ROperator_Pad.hxx rename to core/inc/SOFIE/ROperator_Pad.hxx index dae3a5b..04365d8 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Pad.hxx +++ b/core/inc/SOFIE/ROperator_Pad.hxx @@ -61,13 +61,13 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Pad Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE Pad Op Input Tensor is not found in model"); } fInputShape = model.GetTensorShape(fNX); if (fMode != EMode::kConstant) { - throw std::runtime_error("TMVA SOFIE Pad Op supports now only Constant mode"); + throw std::runtime_error("SOFIE Pad Op supports now only Constant mode"); } // get pads data @@ -75,7 +75,7 @@ public: if (model.IsInitializedTensor(fNP)) { padsData = static_cast(model.GetInitializedTensorData(fNP).get()); } else { - throw std::runtime_error("TMVA SOFIE Pad Op supports now only initialized Pads data"); + throw std::runtime_error("SOFIE Pad Op supports now only initialized Pads data"); } // get constant value fConstantValue = 0; @@ -84,7 +84,7 @@ public: T * cData = static_cast(model.GetInitializedTensorData(fNCV).get()); fConstantValue = cData[0]; } else { - throw std::runtime_error("TMVA SOFIE Pad Op supports now only initialized Constant Value data"); + throw std::runtime_error("SOFIE Pad Op supports now only initialized Constant Value data"); } } std::vector axes; @@ -103,10 +103,10 @@ public: for (size_t i = 0; i < nax; i++) axes[i] = data[i]; } else { - throw std::runtime_error("TMVA SOFIE Pad Op invalid input Axes type"); + throw std::runtime_error("SOFIE Pad Op invalid input Axes type"); } } else { - throw std::runtime_error("TMVA SOFIE Pad Op supports now only 
initialized Axes data"); + throw std::runtime_error("SOFIE Pad Op supports now only initialized Axes data"); } } @@ -127,7 +127,7 @@ public: fPads[i].second = padsData[axesSize + i]; int64_t outDim = static_cast(fOutputShape[i]) + fPads[i].first + fPads[i].second; if (outDim < 0) - throw std::runtime_error("TMVA SOFIE Pad Op : invalid Pads values"); + throw std::runtime_error("SOFIE Pad Op : invalid Pads values"); fOutputShape[i] = outDim; } } @@ -149,7 +149,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fOutputShape.empty()){ - throw std::runtime_error("TMVA SOFIE Operator Pad called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator Pad called to Generate without being initialized first"); } std::stringstream out; auto inputStride = UTILITY::ComputeStrideFromShape(fInputShape); diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Pool.hxx b/core/inc/SOFIE/ROperator_Pool.hxx similarity index 95% rename from src/SOFIE_core/inc/SOFIE/ROperator_Pool.hxx rename to core/inc/SOFIE/ROperator_Pool.hxx index e6fbc25..8e11271 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Pool.hxx +++ b/core/inc/SOFIE/ROperator_Pool.hxx @@ -76,7 +76,7 @@ public: fType = "float"; } else { throw - std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Pool operator"); + std::runtime_error("SOFIE Encountered unsupported type parsing a Pool operator"); } fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; @@ -94,19 +94,19 @@ public: // Where N is batch size, C : input channels, H : input height, W = input width // or it can be [N, C, F1,F2,....FN] . 
Minimum dimension is 3 if (input.size() != 1 ) { - throw std::runtime_error("TMVA SOFIE" + Name() + "Op Shape inference need 1 input tensor"); + throw std::runtime_error("SOFIE" + Name() + "Op Shape inference need 1 input tensor"); } if (input[0].size() < 3) { - throw std::runtime_error("TMVA SOFIE" + Name() + "Op Shape inference only accept tensor with at least 3 dimensions"); + throw std::runtime_error("SOFIE" + Name() + "Op Shape inference only accept tensor with at least 3 dimensions"); } // support only input tensors with dim = 3,4,5 if (input[0].size() < 3 || input[0].size() > 5) { - throw std::runtime_error("TMVA SOFIE" + Name() + "Op : tensors with dimension " + std::to_string(input[0].size()) + " are not yet supported"); + throw std::runtime_error("SOFIE" + Name() + "Op : tensors with dimension " + std::to_string(input[0].size()) + " are not yet supported"); } if (input[0].size() -2 != fDim) { throw - std::runtime_error("TMVA SOFIE Pool Op Shape inference - invalid inputs "); + std::runtime_error("SOFIE Pool Op Shape inference - invalid inputs "); } // kernel shape size_t k1 = ((fAttrKernelShape.empty())? 
input[0][2] : fAttrKernelShape[0]); @@ -156,7 +156,7 @@ public: } } else if (fAttrAutopad != "VALID") { throw - std::runtime_error("TMVA SOFIE" + Name() + "Op invalid Autopad value : " + fAttrAutopad); + std::runtime_error("SOFIE" + Name() + "Op invalid Autopad value : " + fAttrAutopad); } // to be sure pad is vector of size 6 if (fDim < 3) fAttrPads.resize(6, 0); @@ -204,13 +204,13 @@ public: if (!model.CheckIfTensorAlreadyExist(fNX)) { throw - std::runtime_error("TMVA SOFIE Pool op Input Tensor " + fNX + " is not found in model"); + std::runtime_error("SOFIE Pool op Input Tensor " + fNX + " is not found in model"); } fShapeX = model.GetTensorShape(fNX); if (fShapeX.size() < 3 || fShapeX.size() > 5) { std::cout << fNX << " : " << ConvertShapeToString(fShapeX) << std::endl; throw - std::runtime_error("TMVA SOFIE Pool Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions"); + std::runtime_error("SOFIE Pool Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions"); } fDim = fShapeX.size() - 2; // case of GlobalAveragePool. 
It is a pool case with kernel shape == image shape @@ -267,7 +267,7 @@ public: OpName = "op_" + OpName; if (fShapeX.empty() || fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Pool Op called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Pool Op called to Generate without being initialized first"); } std::stringstream out; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.hxx b/core/inc/SOFIE/ROperator_RNN.hxx similarity index 98% rename from src/SOFIE_core/inc/SOFIE/ROperator_RNN.hxx rename to core/inc/SOFIE/ROperator_RNN.hxx index aed7bc1..3a0f58f 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.hxx +++ b/core/inc/SOFIE/ROperator_RNN.hxx @@ -91,7 +91,7 @@ template class ROperator_RNN final : public ROperator { fType = "float"; } else { throw std::runtime_error( - "TMVA SOFIE Encountered unsupported type parsing a RNN operator"); + "SOFIE Encountered unsupported type parsing a RNN operator"); } fInputTensorNames = { fNX, fNW, fNR }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc b/core/inc/SOFIE/ROperator_RNN.icc similarity index 96% rename from src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc rename to core/inc/SOFIE/ROperator_RNN.icc index c03c1c2..467fda8 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc +++ b/core/inc/SOFIE/ROperator_RNN.icc @@ -1,7 +1,6 @@ #ifndef SOFIE_ROPERATOR_RNN_I #define SOFIE_ROPERATOR_RNN_I - namespace SOFIE { template @@ -39,40 +38,40 @@ auto ROperator_RNN::Initialize(RModel& model) fUseSession = model.UseSession(); // Check the input and output tensors if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNX + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNX + " is not found in model."); } fShapeX = model.GetTensorShape(fNX); if (fShapeX.size() != 3) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNX + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNX + " is not of 3 
dimensions."); } if (!model.CheckIfTensorAlreadyExist(fNW)) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNW + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNW + " is not found in model."); } fShapeW = model.GetTensorShape(fNW); if (fShapeW.size() != 3) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNW + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNW + " is not of 3 dimensions."); } if (!model.CheckIfTensorAlreadyExist(fNR)) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNR + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNR + " is not found in model."); } fShapeR = model.GetTensorShape(fNR); if (fShapeR.size() != 3) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNR + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNR + " is not of 3 dimensions."); } if (!fNB.empty()) { if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw std::runtime_error("TMVA SOFIE RNN op input tensor " + fNB + + throw std::runtime_error("SOFIE RNN op input tensor " + fNB + " is not found in model."); } fShapeB = model.GetTensorShape(fNB); if (fShapeB.size() != 2 && fShapeB.size() != 4) { - throw std::runtime_error("TMVA SOFIE RNN op input tensor " + fNB + + throw std::runtime_error("SOFIE RNN op input tensor " + fNB + " is not of 2 or 4 dimensions."); } if (fShapeB.size() == 2) { @@ -112,23 +111,23 @@ auto ROperator_RNN::Initialize(RModel& model) } if (!fNSequence_lens.empty()) { if (!model.CheckIfTensorAlreadyExist(fNSequence_lens)) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNSequence_lens + "is not found in model."); } fShapeSequence_lens = model.GetTensorShape(fNSequence_lens); if (fShapeSequence_lens.size() != 1) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNSequence_lens + " is not of 1 dimension."); } } 
if (!fNInitial_h.empty()) { if (!model.CheckIfTensorAlreadyExist(fNInitial_h)) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNInitial_h + " is not found in model."); } fShapeInitial_h = model.GetTensorShape(fNInitial_h); if (fShapeInitial_h.size() != 3) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNInitial_h + " is not of 3 dimensions."); } } @@ -153,24 +152,24 @@ auto ROperator_RNN::Initialize(RModel& model) activation != "ScaledTanh" && activation != "HardSigmoid" && activation != "Elu" && activation != "Softsign" && activation != "Softplus") { - throw std::runtime_error("TMVA SOFIE - Activation function " + + throw std::runtime_error("SOFIE - Activation function " + activation + " not implemented"); } } if (fAttrDirection != "forward" && fAttrDirection != "backward" && fAttrDirection != "bidirectional") { throw std::runtime_error( - "TMVA SOFIE - Invalid RNN direction fAttrDirection = " + + "SOFIE - Invalid RNN direction fAttrDirection = " + fAttrDirection); } if (fAttrHiddenSize != fShapeW[1]) { throw std::runtime_error( - "TMVA SOFIE - fAttrHiddenSize must be equal to " + + "SOFIE - fAttrHiddenSize must be equal to " + std::to_string(fShapeW[1])); } if (fAttrLayout > 1) { throw std::runtime_error( - "TMVA SOFIE - Layout fAttrLayout = " + std::to_string(fAttrLayout) + + "SOFIE - Layout fAttrLayout = " + std::to_string(fAttrLayout) + " must be 0 (timewise) or 1 (batchwise)"); } if (fAttrActivations.empty()) { @@ -230,7 +229,7 @@ auto ROperator_RNN::Generate(std::string OpName) // set the input if (fAttrLayout == 0) { if (fType == "float") { - out << SP << "float *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << "float const*" << OpName << "_input = tensor_" << fNX << ";\n"; } } else { if (fUseSession) diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Random.hxx 
b/core/inc/SOFIE/ROperator_Random.hxx similarity index 95% rename from src/SOFIE_core/inc/SOFIE/ROperator_Random.hxx rename to core/inc/SOFIE/ROperator_Random.hxx index cde08b5..0de1cd9 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Random.hxx +++ b/core/inc/SOFIE/ROperator_Random.hxx @@ -121,13 +121,13 @@ public: if (fUseROOT) { if (fMode == kNormal) { if (fParams.count("mean") == 0 || fParams.count("scale") == 0) - throw std::runtime_error("TMVA SOFIE RandomNormal op : no mean or scale are defined"); + throw std::runtime_error("SOFIE RandomNormal op : no mean or scale are defined"); float mean = fParams["mean"]; float scale = fParams["scale"]; out << SP << SP << "tensor_" << fNY << "[i] = fRndmEngine->Gaus(" << mean << "," << scale << ");\n"; } else if (fMode == kUniform) { if (fParams.count("high") == 0 || fParams.count("low") == 0) - throw std::runtime_error("TMVA SOFIE RandomUniform op : no low or high are defined"); + throw std::runtime_error("SOFIE RandomUniform op : no low or high are defined"); float high = fParams["high"]; float low = fParams["low"]; out << SP << SP << "tensor_" << fNY << "[i] = fRndmEngine->Uniform(" << low << "," << high << ");\n"; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx b/core/inc/SOFIE/ROperator_Range.hxx similarity index 84% rename from src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx rename to core/inc/SOFIE/ROperator_Range.hxx index 8af272d..8ea17d9 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx +++ b/core/inc/SOFIE/ROperator_Range.hxx @@ -8,7 +8,6 @@ #include #include - namespace SOFIE{ template @@ -51,15 +50,15 @@ public: //input must be a graph input, or already initialized intermediate tensor if (!model.CheckIfTensorAlreadyExist(fNStart)) { throw - std::runtime_error("TMVA SOFIE Range Op Input Tensor " + fNStart + "is not found in model"); + std::runtime_error("SOFIE Range Op Input Tensor " + fNStart + "is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNLimit)) { throw - 
std::runtime_error("TMVA SOFIE Range Op Input Tensor " + fNLimit + "is not found in model"); + std::runtime_error("SOFIE Range Op Input Tensor " + fNLimit + "is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNDelta)) { throw - std::runtime_error("TMVA SOFIE Range Op Input Tensor " + fNDelta + "is not found in model"); + std::runtime_error("SOFIE Range Op Input Tensor " + fNDelta + "is not found in model"); } ETensorType type = ConvertStringToType(fType); if (model.IsInitializedTensor(fNStart) && model.IsInitializedTensor(fNDelta) && model.IsInitializedTensor(fNLimit)) { @@ -67,7 +66,7 @@ public: T * limit = static_cast(model.GetInitializedTensorData(fNLimit).get()); T * delta = static_cast(model.GetInitializedTensorData(fNDelta).get()); if (!start || !delta || !limit) - std::runtime_error("TMVA SOFIE Range Op Input Tensor has invalid input data"); + std::runtime_error("SOFIE Range Op Input Tensor has invalid input data"); T a = *start; T b = *limit; T d = *delta; @@ -89,9 +88,9 @@ public: model.AddDynamicTensor(fNOutput, type, fShape); } if (model.Verbose()) { - std::cout << "Range -> output is " << fNOutput << " "; - if (fIsOutputConstant) std::cout << ConvertDynamicShapeToString(fShape) << std::endl; - else std::cout << ConvertDynamicShapeToString(model.GetDynamicTensorShape(fNOutput)) << std::endl; + std::cout << "Range -> output is " << fNOutput << " : " << ConvertDimShapeToString(fShape); + if (fIsOutputConstant) std::cout << " : " << ConvertValuesToString(model.GetTensorData(fNOutput)); + std::cout << std::endl; } } @@ -103,7 +102,7 @@ public: OpName = "op_" + OpName; if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Range operator called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Range operator called to Generate without being initialized first"); } std::string sizeName = fShape[0].param; @@ -121,5 +120,5 @@ public: }; }//SOFIE - + #endif //SOFIE_ROPERATOR_RANGE diff --git 
a/src/SOFIE_core/inc/SOFIE/ROperator_Reduce.hxx b/core/inc/SOFIE/ROperator_Reduce.hxx similarity index 67% rename from src/SOFIE_core/inc/SOFIE/ROperator_Reduce.hxx rename to core/inc/SOFIE/ROperator_Reduce.hxx index 886aef1..e0d2b7b 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Reduce.hxx +++ b/core/inc/SOFIE/ROperator_Reduce.hxx @@ -68,7 +68,7 @@ public: for (size_t j = 0; j < fAttrAxes.size(); j++) { if (fAttrAxes[j] < 0) fAttrAxes[j] += outputShape.size(); if (fAttrAxes[j] < 0 || (size_t) fAttrAxes[j] >= outputShape.size() ) - throw std::runtime_error("TMVA SOFIE Reduce Op - invalid axes values " + std::to_string(fAttrAxes[j])); + throw std::runtime_error("SOFIE Reduce Op - invalid axes values " + std::to_string(fAttrAxes[j])); // set to 1 the reduced dims outputShape[fAttrAxes[j]] = 1; } @@ -94,7 +94,7 @@ public: if (!model.CheckIfTensorAlreadyExist(fNX)) { // input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Reduce Op Input Tensor " + fNX + " is not found in model"); + throw std::runtime_error("SOFIE Reduce Op Input Tensor " + fNX + " is not found in model"); } fShapeX = model.GetTensorShape(fNX); // check if tensor with axes is provided @@ -122,7 +122,7 @@ public: std::string Generate(std::string opName) override { opName = "op_" + opName; if (fShapeX.empty() || fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Reduce Op called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Reduce Op called to Generate without being initialized first"); } size_t inputLength = SOFIE::ConvertShapeToLength(fShapeX); @@ -261,6 +261,123 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + if (fShapeX.empty() || fShapeY.empty()) + throw std::runtime_error("SOFIE Reduce Op called to Generate without being initialized first"); + + const std::size_t Dx = fShapeX.size(); + + auto inputStrides = 
UTILITY::ComputeStrideFromShape(fShapeX); + auto outputStrides = UTILITY::ComputeStrideFromShape(fShapeYNotPruned); + + std::size_t inputLength = ConvertShapeToLength(fShapeX); + std::size_t outputLength = ConvertShapeToLength(fShapeY); + std::size_t reducedLength = inputLength / outputLength; + + std::string kname = "ReduceKernel_" + Name() + "_" + fNY; + + std::string op; + op = "\n//------ " + Name() + "_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const outputLength) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= outputLength) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t out_idx = global_thread_idx; out_idx < outputLength; out_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < Dx; ++d) { + op += SP + SP + SP + SP + "std::size_t const oy_" + std::to_string(d) + + " = (out_idx / " + std::to_string(outputStrides[d]) + "u) % " + + std::to_string(fShapeYNotPruned[d]) + "u;\n"; + } + op += "\n"; + + std::string startVal = (Op == ReduceProd) ? 
"static_cast(1)" : "static_cast(0)"; + op += SP + SP + SP + SP + "T acc_val = " + startVal + ";\n\n"; + + std::vector redAxes; + std::vector keepAxes; + for (std::size_t d = 0; d < Dx; ++d) { + if (std::find(fAttrAxes.begin(), fAttrAxes.end(), (int64_t)d) != fAttrAxes.end()) + redAxes.push_back(d); + else + keepAxes.push_back(d); + } + + std::string indent = SP + SP + SP + SP; + for (std::size_t rd : redAxes) { + op += indent + "for (std::size_t r_" + std::to_string(rd) + + " = 0; r_" + std::to_string(rd) + + " < " + std::to_string(fShapeX[rd]) + "u; r_" + + std::to_string(rd) + "++) {\n"; + indent += SP; + } + + op += indent + "std::size_t const in_idx =\n"; + for (std::size_t d = 0; d < Dx; ++d) { + std::string coord = (std::find(redAxes.begin(), redAxes.end(), d) != redAxes.end()) + ? "r_" + std::to_string(d) + : "oy_" + std::to_string(d); + op += indent + SP + coord + " * " + std::to_string(inputStrides[d]) + "u"; + op += (d + 1 < Dx) ? " +\n" : ";\n"; + } + + if (Op == ReduceProd) + op += indent + "acc_val *= input[in_idx];\n"; + else if (Op == ReduceSum || Op == ReduceMean) + op += indent + "acc_val += input[in_idx];\n"; + else if (Op == ReduceSumSquare) + op += indent + "acc_val += input[in_idx] * input[in_idx];\n"; + + for (std::size_t i = 0; i < redAxes.size(); ++i) { + indent = indent.substr(SP.length()); + op += indent + "}\n"; + } + + if (Op == ReduceMean) + op += SP + SP + SP + SP + "acc_val /= static_cast(" + std::to_string(reducedLength) + "u);\n"; + + op += SP + SP + SP + SP + "output[out_idx] = acc_val;\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + std::string kname = "ReduceKernel_" + Name() + "_" + fNY; + return SP + kname + " reduceKernel_" + Name() + "_" + fNY + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string /*opName*/) override { + if (fShapeX.empty() || fShapeY.empty()) + throw 
std::runtime_error("SOFIE Reduce Op called to Generate without being initialized first"); + + std::size_t outputLength = ConvertShapeToLength(fShapeY); + std::string kname = "reduceKernel_" + Name() + "_" + fNY; + + std::stringstream out; + out << "\n//------ " << Name() << "_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << outputLength << "});\n"; + out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNY + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << outputLength << "));\n"; + + return out.str(); + } + }; }//SOFIE diff --git a/core/inc/SOFIE/ROperator_Relu.hxx b/core/inc/SOFIE/ROperator_Relu.hxx new file mode 100644 index 0000000..96d5931 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Relu.hxx @@ -0,0 +1,130 @@ +#ifndef SOFIE_ROPERATOR_RELU +#define SOFIE_ROPERATOR_RELU + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + + +namespace SOFIE{ + +template +class ROperator_Relu final : public ROperator +{ + +private: + + std::string fNX; + std::string fNY; + std::vector fShape; + +public: + ROperator_Relu(){} + ROperator_Relu(std::string nameX, std::string nameY): + fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::RELU; + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; //suggest copy to compiler + return ret; + } + + void Initialize(RModel& model) override { + if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized 
intermediate tensor + throw std::runtime_error("SOFIE Relu Op Input Tensor " + fNX + " is not found in model"); + } + + fShape = model.GetDimTensorShape(fNX); + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); + if (model.Verbose()) { + std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDimShapeToString(fShape) << std::endl; + } + } + + + std::string Generate(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Operator Relu called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDimShapeToLength(fShape); + out << "\n//------ RELU\n"; + out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; + out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] > 0 )? tensor_" << fNX << "[id] : 0);\n"; + out << SP << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { + std::string op; + op = "\n//------ RELU_KERNEL_ALPAKA\n"; + + op = "\n//------ RELU_KERNEL_ALPAKA\n"; + op += "struct ReluKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < numElements) {\n"; + op += SP + SP + SP + "out[idx] = data[idx] >= T(0) ? 
data[idx] : 0;\n"; + op += SP + SP + "}\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "ReluKernel reluKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Operator Relu called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertDimShapeToLength(fShape); + out << "\n//------ RELU_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNY + << ", reluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + bool IsElementwise() const override { return true; } + std::string GetElementwiseExpr(const std::string& v) const override { + return "(" + v + ") >= T(0) ? 
(" + v + ") : T(0)"; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } +}; + +}//SOFIE + +#endif //SOFIE_ROPERATOR_RELU diff --git a/core/inc/SOFIE/ROperator_Reshape.hxx b/core/inc/SOFIE/ROperator_Reshape.hxx new file mode 100644 index 0000000..56554dd --- /dev/null +++ b/core/inc/SOFIE/ROperator_Reshape.hxx @@ -0,0 +1,444 @@ +#ifndef SOFIE_ROPERATOR_RESHAPE +#define SOFIE_ROPERATOR_RESHAPE + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include +#include + + +namespace SOFIE{ + +enum ReshapeOpMode { Reshape, Flatten, Squeeze, Unsqueeze }; + + +class ROperator_Reshape final : public ROperator +{ + +private: + + bool fVerbose = false; + bool fDimInput = false; + bool fDynamicShape = false; + ReshapeOpMode fOpMode = Reshape; // type of Reshape operator + + int fAllowZero = 0; // (for Reshape) zero in tensor shape makes output shape equal to input tensor shape + int fAxis = 1; // (for Flatten) + + std::string fNData; // input data tensor name + std::string fNInput2; // reshape or axes tensor name depending on operator + std::string fNOutput; // output tensor name + std::vector fShapeInput; // input shape data + std::vector fShapeOutput; // output shape data + std::vector fOutputShapeData; // in case output is a shape tensor we store here the shape value data (can be parametric) + std::vector fAttrAxes; // axes attributes (provided for all version of Squeeze/Unsqueeze) + std::vector fShape; // shape tensor values provided for Reshape for int shapes4 + +public: + + std::string Name() const { + if (fOpMode == Reshape) return "Reshape"; + if (fOpMode == Flatten) return "Flatten"; + if (fOpMode == Squeeze) return "Squeeze"; + if (fOpMode == Unsqueeze) return "Unsqueeze"; + return ""; 
+ } + + ROperator_Reshape(){} + ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameInput2, std::string nameOutput) + : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNInput2(UTILITY::Clean_name(nameInput2)), + fNOutput(UTILITY::Clean_name(nameOutput)) + { + if (opMode == Reshape) fAllowZero = attr_value; + if (opMode == Flatten) fAxis = attr_value; + + fInputTensorNames = { fNData }; + if(!fNInput2.empty()){ + fInputTensorNames.emplace_back(fNInput2); + } + fOutputTensorNames = { fNOutput }; + } + + // for squeeze/unsqueezed operators following old ONNX version (< 10) + // In this cases axes are passed as attribute values + ROperator_Reshape(ReshapeOpMode opMode, std::vector attrAxes, std::string nameData, std::string nameOutput) + : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNOutput(UTILITY::Clean_name(nameOutput)), + fAttrAxes(attrAxes) + { + assert(fOpMode == Squeeze || fOpMode == Unsqueeze); + fInputTensorNames = { fNData }; + fOutputTensorNames = { fNOutput }; + } + + + // output shape + std::vector DoShapeInference(const std::vector & input_shape, const std::vector & target_shape) { + if (fOpMode == Reshape) { + // correct the provided shape (here we have the value) for 0 or -1 + // the target_shape can be a scalar in case of not present shape input tensor + std::vector output_shape = target_shape; + bool hasMinusOne = false; + bool hasZero = false; + for (size_t i = 0; i < output_shape.size(); i++) { + // case for zero values in given shape: in this case we take the corresponding value from input shape + if (!output_shape[i].isParam) { + if (output_shape[i].dim == 0) { + hasZero = true; + if (fAllowZero) + output_shape[i] = Dim{0}; + else { + if (i > 0 && output_shape.size() != input_shape.size()) + std::cout << "WARNING: TMVA Reshape Op : output shape has zero value at index " << i << + " but input shape has a different rank than output shape" << std::endl; + if (i >= input_shape.size()) + 
throw std::runtime_error("TMVA Reshape Op : output shape has zero value at index " + std::to_string(i) + + " but input shape does not have corresponding index"); + } + output_shape[i] = input_shape[i]; + } else if (output_shape[i].dim == static_cast(-1)) { + hasMinusOne = true; + } + } + } + if (hasZero && hasMinusOne) { + throw std::runtime_error("TMVA Reshape Op : zero value in shape is not allowed when there is also a -1 in shape"); + } + // now case of -1 in shape - we can infer the value of -1 from all other values + for (size_t i = 0; i < output_shape.size(); i++) { + if (output_shape[i] == static_cast(-1) && !output_shape[i].isParam) { + auto tmp = output_shape; + tmp.erase(tmp.begin() + i); // erase -1 value to compute the length of the other dimensions + auto tmp_length = ConvertDimShapeToLength(tmp); + auto input_length = ConvertDimShapeToLength(input_shape); + if (fVerbose) + std::cout << "reshape- try simplifying " << ConvertDimShapeToString(input_shape) << " with length " + << input_length << " to " << tmp_length << std::endl; + + if (IsInteger(tmp_length) && IsInteger(input_length)) + output_shape[i] = Dim{static_cast(std::stoi(input_length) / std::stoi(tmp_length))}; + else if (IsInteger(tmp_length) && std::stoi(tmp_length) == 1) { + output_shape[i] = Dim{input_length, static_cast(-1)}; + } + else { + //we can try simplifying expression if tmp_length is integer and part of input_length + // contains tmp_length + bool canSimplify = false; + std::vector reduced_input; + if (IsInteger(tmp_length)) { + + // try to tokenize with * the input length + + std::stringstream ss(input_length); + + std::string token; + + // Tokenizing w.r.t. 
space '*' + while(getline(ss, token, '*')) + { + // remove any whitespace + token.erase(std::remove_if(token.begin(), token.end(), + [](unsigned char x) { return std::isspace(x); }), token.end()); + if (token != tmp_length) { + if (IsInteger(token)) { + size_t il = static_cast(std::stoi(input_length)); + size_t tl = static_cast(std::stoi(tmp_length)); + if ((il % tl) == 0) { + canSimplify = true; + reduced_input.push_back(Dim{il / tl}); + } + } else { + reduced_input.push_back(Dim{token}); + } + } else { + // token is equal to tmp_length, can be not considered and is simplified + canSimplify = true; + } + } + } + if (canSimplify) { + // if length contains * we need to add some brackets + std::string res_shape = ConvertDimShapeToLength(reduced_input); + if (res_shape.find('*') != std::string::npos) + output_shape[i] = Dim{std::string("(") + res_shape + ")", static_cast(-1)}; + else + output_shape[i] = Dim{res_shape}; + } + if (!canSimplify) + output_shape[i] = Dim{std::string("(") + input_length + " / (" + tmp_length + "))", static_cast(-1)}; + } + + break; // cannot have more than -1 + } + // throw std::runtime_error( + // "TMVA Reshape Op : output shape has multiple negative or zero values"); + } + + if (fVerbose) + std::cout << "Reshape: correct output shape to " << ConvertDimShapeToString(output_shape) << std::endl; + + if (!fDimInput && ConvertDimShapeToLength(output_shape) != ConvertDimShapeToLength(input_shape)) { + throw std::runtime_error("TMVA Reshape Op : Invalid shapes : " + ConvertDimShapeToString(input_shape) + + ConvertDimShapeToString(output_shape)); + } + return output_shape; + + } else if (fOpMode == Flatten) { + // flatten case + if (fAxis < 0) + fAxis += input_shape.size(); + auto s1 = std::vector(input_shape.begin(), input_shape.begin() + fAxis); + auto s2 = std::vector(input_shape.begin() + fAxis, input_shape.end()); + auto l1 = ConvertDimShapeToLength(s1); + auto l2 = ConvertDimShapeToLength(s2); + std::vector newShape = {Dim{l1}, Dim{l2}}; + 
return newShape; + } else if (fOpMode == Squeeze) { + // squeeze + // assume no axis is provided - remove all axes with value equal to 1 + auto output_shape = input_shape; + if (fAttrAxes.empty()) { + size_t i = 0; + while (i < output_shape.size()) { + if (output_shape[i] == Dim{1}) { + output_shape.erase(output_shape.begin() + i); + } else { + i++; + } + } + } else { + auto axes = fAttrAxes; + for (size_t i = 0; i < axes.size(); i++) { + if (axes[i] < 0) + axes[i] += input_shape.size(); + if (!(output_shape[axes[i]] == Dim{1})) + throw std::runtime_error("TMVA Squeeze Op : Invalid axis value " + std::to_string(axes[i]) + + " for " + ConvertDimShapeToString(output_shape)); + } + // for calling vector::erase we must sort axes in decreasing order to avoid + std::sort(axes.begin(), axes.end(), std::greater()); + for (auto & axis : axes) { + output_shape.erase(output_shape.begin() + axis); + } + } + return output_shape; + } + else if (fOpMode == Unsqueeze) { + // unsqueeze + assert(!fAttrAxes.empty()); + auto output_shape = input_shape; + auto &axes = fAttrAxes; + // output rank + int64_t r = input_shape.size() + axes.size(); + for (auto &a : axes) { + int64_t i = static_cast(a); + if (i < -r || i > r - 1) + throw std::runtime_error("TMVA Unsqueeze Op - axes input is not in correct range"); + if (i >= 0) + output_shape.insert(output_shape.begin() + i, Dim{1}); + else + // negative axes + output_shape.insert(output_shape.end() + i + 1, Dim{1}); + } + return output_shape; + } + throw std::runtime_error("TMVA Reshape Op : Invalid ReshapeOpMode"); + return {Dim{}}; + } + + void Initialize(RModel& model) override { + + fVerbose = model.Verbose(); + if (fVerbose) + std::cout << "initialize reshape op type " << fOpMode << " - for input " << fNData + << " to shape given by " << fNInput2 << std::endl; + + if (model.CheckIfTensorAlreadyExist(fNData) == false) { + // input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("TMVA Reshape 
Op Input Tensor " + fNData + " is not found in model"); + } + fShapeInput = model.GetDimTensorShape(fNData); + fDimInput = model.IsDynamicTensor(fNData); + // check if optional tensor exists defining shape or axes + if (!fNInput2.empty()) { + if (model.CheckIfTensorAlreadyExist(fNInput2)) { + if (model.IsInitializedTensor(fNInput2)) { + // assume input shape is an initialized tensor + auto dptr = model.GetInitializedTensorData(fNInput2); + auto values = static_cast(dptr.get()); + auto vec = model.GetTensorShape(fNInput2); + size_t n = 1; + if (vec.size() > 0) + n = vec[0]; // size of shape input tensor + // copy values in fShape vector or fAttrAxes + if (fOpMode == Reshape) + fShape = std::vector(values, values + n); + else + fAttrAxes = std::vector(values, values + n); + + std::vector targetShape(fShape.begin(),fShape.end()); + fShapeOutput = DoShapeInference(fShapeInput, targetShape); + // set flag to not write tensor in weight file. Its data will be hard-coded in way model is constructed + model.SetNotWritableInitializedTensor(fNInput2); + } else if (model.IsShapeTensor(fNInput2)) { + auto shapeData = model.GetShapeTensorValues(fNInput2); + fShapeOutput = DoShapeInference(fShapeInput, shapeData); + if (model.Verbose()) + std::cout << "Reshape op - get output shape from shape tensor " << fNInput2 << " with value " << ConvertDimShapeToString(shapeData) << std::endl; + } else { + // we cannot get shape at initialization time but at run-time + fDynamicShape = true; + // size of shape output us given by size of shape input tensor + if (model.IsDynamicTensor(fNInput2)) { + throw std::runtime_error("TMVA Reshape Op 2nd input Tensor " + fNInput2 + " cannot have dynamic shape"); + } + auto shapeInput2 = model.GetTensorShape(fNInput2); + fShapeOutput.resize(shapeInput2[0]); + for (size_t i = 0; i < fShapeOutput.size(); i++) { + fShapeOutput[i] = Dim{ std::string("s_") + fNOutput + "_" + std::to_string(i)}; + } + } + } else { + throw std::runtime_error("TMVA Reshape Op 2nd 
input Tensor " + fNInput2 + " is not found in model"); + } + } else if (!fAttrAxes.empty()) { + // case fNShape is empty and axes are provided as attributes (e.g. for Unsqueeze) + fShapeOutput = DoShapeInference(fShapeInput, std::vector{}); + } else if (fOpMode == Flatten || fOpMode == Squeeze) { + fShapeOutput = DoShapeInference(fShapeInput, std::vector{}); + } else { + throw std::runtime_error("TMVA Reshape Op : Invalid Input/Attribute data"); + } + // check if output is constant or not + if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) { + fIsOutputConstant = true; + auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); + auto o_shape = ConvertShapeToInt(fShapeOutput); + if (ConvertShapeToLength(ConvertShapeToInt(fShapeInput)) != ConvertShapeToLength(o_shape) ) + throw std::runtime_error("TMVA Reshape Op : Invalid Input/Output lengths"); + model.AddConstantTensor(fNOutput, o_shape, inputData); + if (model.Verbose()) { + std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " --> " << fNOutput << " (constant) " << ConvertDimShapeToString(fShapeOutput) << " : " << + ConvertValuesToString(ConvertShapeToLength(o_shape), inputData) << std::endl; + } + } + // for input shape tensors we can have it if output shape is size==1 or a scalar + else if (model.IsShapeTensor(fNData) && fShapeOutput.size() <=1) { + // not sure if we ever end-up here - maybe reshaping from scalar to vector or viceversa + fIsOutputParamShape = true; + fOutputShapeData = model.GetShapeTensorValues(fNData); + model.AddShapeTensor(fNOutput, fOutputShapeData); + if (model.Verbose()) { + std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " --> " << fNOutput << " (shape) " << ConvertDimShapeToString(fShapeOutput) << " : " << + ConvertDimShapeToString(fOutputShapeData) << std::endl; + } + } + else { + // non-constant case + model.AddIntermediateTensor(fNOutput, 
model.GetTensorType(fNData), fShapeOutput); + if (model.Verbose()) + std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " --> "<< fNOutput << " " << ConvertDimShapeToString(fShapeOutput) << std::endl; + } + } + + std::string Generate(std::string opName) override { + + + std::stringstream out; + std::string opType = "Reshape"; + if (fOpMode == Flatten) + opType = "Flatten"; + else if (fOpMode == Squeeze) + opType = "Squeeze"; + else if (fOpMode == Unsqueeze) + opType = "Unsqueeze"; + + out << SP << "///--------" << opType << " operator " << opName << " --> " << ConvertDimShapeToString(fShapeOutput) << "\n"; + + if (fIsOutputConstant) return out.str(); //no op for constant tensors + + if (fIsOutputParamShape) { + // no code to generate here for param shape output. Tensor output is defined in Session constructor + out << "//----------------output is a shape tensor----------\n"; + for (int i = 0; i < static_cast(fShapeOutput[0].dim); i++) { + out << SP << "tensor_" << fNOutput << "[" << i << " ] = " << fOutputShapeData[i].GetVal() << ";\n"; + } + return out.str(); + } + + // in case of dynamic output shape we need to set the shape value from input shape tensor + // and take case of the zero values + if (fDynamicShape) { + for (size_t i = 0; i < fShapeOutput.size(); i++) { + // since fNInput2 values are int64_t, should we check if they are negative? 
+ out << SP << "size_t " << fShapeOutput[i].param << " = " << "tensor_" << fNInput2 << "[" << i << "];\n"; + if (!fAllowZero) + out << SP << "if (tensor_" << fNInput2 << "[" << i << "] <= 0 ) " + << fShapeOutput[i].param << " = " << fShapeInput[i] << ";\n"; + } + } + + // output of reshape is same as input + auto lengthOut = ConvertDimShapeToLength(fShapeOutput); + auto lengthIn = ConvertDimShapeToLength(fShapeInput); + if (lengthOut != lengthIn) { + // check needs to be done at run-time + out << SP << "if (" << lengthOut << "!=" << lengthIn << ")\n"; + out << SP << SP << "throw std::runtime_error(\"SOFIE Reshape " << opName << " output length " + << lengthOut << " is different than input one " << lengthIn << "\");\n"; + } + + + out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << lengthIn << ", " << "tensor_" << fNOutput + << ");\n"; + return out.str(); + } + +std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + + opName = "op_" + opName; + + if (fIsOutputParamShape) { + // shape tensor output: fill host-side tensor values, no device copy needed + std::stringstream out; + for (int i = 0; i < static_cast(fShapeOutput[0].dim); i++) { + out << SP << "tensor_" << fNOutput << "[" << i << "] = " << fOutputShapeData[i].GetVal() << ";\n"; + } + return out.str(); + } + + std::string opType = "Reshape"; + if (fOpMode == Flatten) opType = "Flatten"; + else if (fOpMode == Squeeze) opType = "Squeeze"; + else if (fOpMode == Unsqueeze) opType = "Unsqueeze"; + + std::stringstream out; + out << SP << "///------- " << opType << " operator " << opName << "\n"; + + if (fDynamicShape) { + auto lengthOut = ConvertDimShapeToLength(fShapeOutput); + auto lengthIn = ConvertDimShapeToLength(fShapeInput); + if (lengthOut != lengthIn) { + out << SP << "if (" << lengthOut << " != " << lengthIn << ")\n"; + out << SP << SP << "throw std::runtime_error(\"SOFIE " << opType + << " Op : output length is different from input 
length\");\n"; + } + } + + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNOutput + << ", deviceBuf_" << fNData << ");\n"; + out << SP << "alpaka::wait(queue);\n"; + + return out.str(); +} + +}; + +}//SOFIE + + +#endif //SOFIE_ROPERATOR_RESHAPE diff --git a/core/inc/SOFIE/ROperator_ScatterElements.hxx b/core/inc/SOFIE/ROperator_ScatterElements.hxx new file mode 100644 index 0000000..b69ee71 --- /dev/null +++ b/core/inc/SOFIE/ROperator_ScatterElements.hxx @@ -0,0 +1,279 @@ +#ifndef SOFIE_ROperator_ScatterElements +#define SOFIE_ROperator_ScatterElements + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + + +namespace SOFIE{ + + +class ROperator_ScatterElements final : public ROperator{ +private: + + int64_t fAxis; + + std::string fNX; + std::string fNI; + std::string fNU; + std::string fNY; + std::string fReduction; + + std::vector fShapeX; + std::vector fShapeI; + std::vector fShapeY; + + // define reduction function. Possibilities are: + // none (default), add, mul, max, min + std::string ReductionFunction(const std::string & t1, const std::string & t2 ) { + std::string name = fReduction; + if (name.empty() || name == "none") + return t2; + else if (name == "add") + return t1 + " + " + t2; + else if (name == "mul") + return t1 + " * " + t2; + else if (name == "max") + return "std::max(" + t1 + "," + t2 + ")"; + else if (name == "min") + return "std::min(" + t1 + "," + t2 + ")"; + else + throw std::runtime_error("SOFIE ScatterElements : invalid reduction attribute"); + + return std::string(); + } + +public: + ROperator_ScatterElements(){} + ROperator_ScatterElements(const std::string & nameX, const std::string & nameI, const std::string & nameU, const std::string & nameY, + int axis, std::string reduction): + fAxis(axis), + fNX(UTILITY::Clean_name(nameX)), fNI(UTILITY::Clean_name(nameI)), fNU(UTILITY::Clean_name(nameU)), + fNY(UTILITY::Clean_name(nameY)), + fReduction(reduction) + { + 
fInputTensorNames = { fNX, fNI, fNU }; + fOutputTensorNames = { fNY }; + } + + // type of output given input + std::vector TypeInference(std::vector input) override { + return input; + } + + // shape of output tensors given input tensors + std::vector> ShapeInference(std::vector> input) override { + auto ret = std::vector>(1, input[0]); // return vector size 1 with first input + return ret; + } + + void Initialize(RModel& model) override { + // input must be a graph input, or already initialized intermediate tensor + if (!model.CheckIfTensorAlreadyExist(fNX)){ + throw std::runtime_error(std::string("SOFIE ScatterElements Op Input Tensor ") + fNX + "is not found in model"); + } + if (!model.CheckIfTensorAlreadyExist(fNI)) { + throw std::runtime_error(std::string("SOFIE ScatterElements Op Input Tensor ") + fNI + "is not found in model"); + } + if (!model.CheckIfTensorAlreadyExist(fNU)) { + throw std::runtime_error(std::string("SOFIE ScatterElements Op Input Tensor ") + fNU + "is not found in model"); + } + //tbd check for constant tensors + + fShapeX = model.GetTensorShape(fNX); + fShapeI = model.GetTensorShape(fNI); + if (model.GetTensorShape(fNU) != fShapeI) + throw std::runtime_error(std::string("SOFIE ScatterElements - update tensor has invalid shape ")) ; + if (fShapeX.size() == 0) + throw std::runtime_error(std::string("SOFIE ScatterElements - input tensor has zero rank ")) ; + if (fShapeX.size() != fShapeI.size()) + throw std::runtime_error(std::string("SOFIE ScatterElements - index tensor has invalid rank ")) ; + + if (fAxis < 0) fAxis += fShapeX.size(); + + // assume output shape is identical to input shape + fShapeY = fShapeX; + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + } + + std::string GenerateInitCode() override { + std::stringstream out; + return out.str(); + } + + std::string Generate(std::string opName) override { + + if (fIsOutputConstant) return ""; + + if (fShapeY.empty()) { + throw std::runtime_error("SOFIE 
ScatterElements Op called to Generate without being initialized first"); + } + std::stringstream out; + out << SP << "\n//-------- ScatterElements --- " << opName << "\n"; + + auto strideY = UTILITY::ComputeStrideFromShape(fShapeY); + auto strideI = UTILITY::ComputeStrideFromShape(fShapeI); + + size_t length = ConvertShapeToLength(fShapeY); + + // function to write compute expression for global index from axes indices + auto tensorIndex = [](const std::vector & stride, const std::vector & idx) { + std::stringstream strst; + int dims = idx.size(); + assert (dims == (int) stride.size()); + for (int i = 0; i < dims; i++) { + if (stride[i] != 1) + strst << stride[i] << "*" << idx[i]; + else + strst << idx[i]; + if (i < dims-1) + strst << " + "; + } + return strst.str(); + }; + + + // copy first input in output (maybe can be avoided??) + out << SP << "std::copy(tensor_" << fNX << ", tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n"; + + // loop on tensor rank + int dims = fShapeY.size(); + std::vector idx(dims); + for (int i = 0; i < dims; i++) { + idx[i] = std::string("i") + std::to_string(i); + for (int j = 0; j <= i; j++) out << SP; + out << "for (int " << idx[i] << " = 0; " << idx[i] << " < " << fShapeI[i] << "; " << idx[i] << "++) {\n"; + } + // correct index for specific axis + for (int j = 0; j <= dims; j++) out << SP; + out << "int updateIndex = " << tensorIndex(strideI,idx) << ";\n"; + for (int j = 0; j <= dims; j++) out << SP; + out << "int iAxis = tensor_" << fNI << "[updateIndex];\n"; + for (int j = 0; j <= dims; j++) out << SP; + out << "if (iAxis < 0) iAxis += " << fShapeY[fAxis] << ";\n"; + idx[fAxis] = "iAxis"; + for (int j = 0; j <= dims; j++) out << SP; + out << "int outIndex = " << tensorIndex(strideY, idx) << ";\n"; + for (int j = 0; j <= dims; j++) out << SP; + out << "tensor_" << fNY << "[outIndex] = " + << ReductionFunction(std::string("tensor_") + fNY + "[outIndex]", std::string("tensor_") + fNU + "[updateIndex]") << ";\n"; + + 
for (int i = dims; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) { + throw std::runtime_error("SOFIE ScatterElements Op called to Generate without being initialized first"); + } + + const std::size_t D = fShapeI.size(); + + auto strideY = UTILITY::ComputeStrideFromShape(fShapeY); + auto strideI = UTILITY::ComputeStrideFromShape(fShapeI); + + std::size_t totalElements = 1; + for (std::size_t d = 0; d < D; ++d) + totalElements *= fShapeI[d]; + + std::string op; + op = "\n//------ SCATTERELEMENTS_KERNEL_ALPAKA\n"; + op += SP + "struct ScatterElementsKernel_" + opName + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T* Y,\n"; + op += SP + SP + SP + "int64_t const* I,\n"; + op += SP + SP + SP + "T const* U,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + op += SP + SP + SP + SP + "std::size_t remaining = elem_idx;\n"; + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const idx_" + std::to_string(d) + + " = remaining / " + std::to_string(strideI[d]) + ";\n"; + op += SP + SP + SP + SP + "remaining -= idx_" + std::to_string(d) + + " * " + std::to_string(strideI[d]) + ";\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "int64_t iAxis = I[elem_idx];\n"; + op += SP + SP + SP + SP + "if (iAxis < 0) iAxis += " + std::to_string(fShapeY[fAxis]) + 
";\n\n"; + + op += SP + SP + SP + SP + "std::size_t const out_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string coord = (d == (std::size_t)fAxis) + ? "static_cast(iAxis)" + : "idx_" + std::to_string(d); + op += SP + SP + SP + SP + SP + coord + " * " + std::to_string(strideY[d]); + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + if (fReduction.empty() || fReduction == "none") { + op += SP + SP + SP + SP + "Y[out_idx] = U[elem_idx];\n"; + } else if (fReduction == "add") { + op += SP + SP + SP + SP + "alpaka::atomicAdd(acc, &Y[out_idx], U[elem_idx]);\n"; + } else if (fReduction == "mul") { + op += SP + SP + SP + SP + "alpaka::atomicMul(acc, &Y[out_idx], U[elem_idx]);\n"; + } else if (fReduction == "max") { + op += SP + SP + SP + SP + "alpaka::atomicMax(acc, &Y[out_idx], U[elem_idx]);\n"; + } else if (fReduction == "min") { + op += SP + SP + SP + SP + "alpaka::atomicMin(acc, &Y[out_idx], U[elem_idx]);\n"; + } + + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + +std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + return SP + "ScatterElementsKernel_" + opName + " scatterElementsKernel_" + opName + ";\n"; +} + +std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) { + throw std::runtime_error("SOFIE ScatterElements Op called to Generate without being initialized first"); + } + + std::size_t totalElements = ConvertShapeToLength(fShapeI); + + std::stringstream out; + out << "\n//------ SCATTERELEMENTS_GPU_ALPAKA\n"; + + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY << ", deviceBuf_" << fNX << ");\n"; + out << SP << "alpaka::wait(queue);\n\n"; + + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " 
= sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "auto task_" << opName << " = alpaka::createTaskKernel(workDiv_" << opName + << ", scatterElementsKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNI << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNU << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::enqueue(queue, task_" << opName << ");\n"; + return out.str(); +} +}; + +}//SOFIE + + +#endif //SOFIE_ROperator_ScatterElements diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Selu.hxx b/core/inc/SOFIE/ROperator_Selu.hxx similarity index 90% rename from src/SOFIE_core/inc/SOFIE/ROperator_Selu.hxx rename to core/inc/SOFIE/ROperator_Selu.hxx index 96f4445..68ef253 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Selu.hxx +++ b/core/inc/SOFIE/ROperator_Selu.hxx @@ -38,7 +38,7 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Selu Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE Selu Op Input Tensor is not found in model"); } fShape = model.GetTensorShape(fNX); model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); @@ -48,7 +48,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShape.empty()){ - throw std::runtime_error("TMVA SOFIE Operator Selu called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator Selu called to Generate without being initialized first"); } std::stringstream out; int length = 1; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx b/core/inc/SOFIE/ROperator_Shape.hxx similarity index 78% rename from src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx rename to core/inc/SOFIE/ROperator_Shape.hxx index 52bdeae..299de7c 100644 --- 
a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx +++ b/core/inc/SOFIE/ROperator_Shape.hxx @@ -47,7 +47,7 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Shape Op Input Tensor " + fNX + " is not found in model"); + throw std::runtime_error("SOFIE Shape Op Input Tensor " + fNX + " is not found in model"); } fShape = model.GetTensorShape(fNX); size_t length = fShape.size(); // this the size of shape not length of tensor @@ -87,7 +87,7 @@ public: OpName = "op_" + OpName; if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Shape op called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Shape op called to Generate without being initialized first"); } std::stringstream out; @@ -101,6 +101,26 @@ public: return out.str(); } + std::string Generate_GPU_ALPAKA(std::string OpName) override { + // no need to generate code if the output is constant + if (fIsOutputConstant) return ""; + + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Shape op called to Generate without being initialized first"); + } + std::stringstream out; + + out << "\n//------ Shape\n"; + // add a dummy statement to avoid warning for unused input + out << SP << "(void) deviceBuf_" << fNX << ";\n"; + size_t length = ConvertShapeToLength(fOutput_shape); + for (size_t id = 0; id < length; id++) { + out << SP << "deviceBuf_" << fNY << "["<< id << "] = " << fShape[fStart+id] << ";\n"; + } + return out.str(); + } + }; }//SOFIE diff --git a/core/inc/SOFIE/ROperator_Sigmoid.hxx b/core/inc/SOFIE/ROperator_Sigmoid.hxx new file mode 100644 index 0000000..aa9aa09 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Sigmoid.hxx @@ -0,0 +1,127 @@ +#ifndef SOFIE_ROPERATOR_Sigmoid +#define SOFIE_ROPERATOR_Sigmoid + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include 
"SOFIE/RModel.hxx" + +#include + +namespace SOFIE{ + +template +class ROperator_Sigmoid final : public ROperator +{ + +private: + + std::string fNX; + std::string fNY; + std::vector fShape; + +public: + ROperator_Sigmoid(){} + ROperator_Sigmoid(std::string nameX, std::string nameY): + fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::SIGMOID; + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; //suggest copy to compiler + return ret; + } + + void Initialize(RModel& model) override { + if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("SOFIE Sigmoid Op Input Tensor is not found in model"); + } + fShape = model.GetTensorShape(fNX); + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); + } + + + std::string Generate(std::string opName) override { + if (fShape.empty()){ + throw std::runtime_error("SOFIE Operator Sigmoid called to Generate without being initialized first"); + } + std::stringstream out; + int length = 1; + for(auto& i: fShape){ + length *= i; + } + out << "\n//------ Sigmoid -- " << opName << "\n"; + out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; + out << SP << SP << "tensor_" << fNY << "[id] = 1 / (1 + std::exp( - tensor_" << fNX << "[id]));\n"; + out << SP << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = "\n//------ SIGMOID_KERNEL_ALPAKA\n"; + op += "struct SigmoidKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n"; + op += SP + SP + "const auto idx = alpaka::getIdx(acc)[0];\n"; + op 
+= SP + SP + "if(idx < numElements) {\n"; + op += SP + SP + SP + SP + "out[idx] = static_cast(1) / (static_cast(1) + exp(-data[idx]));\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "SigmoidKernel sigmoidKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Operator Sigmoid called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ SIGMOID_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNX + << ", sigmoidKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + bool IsElementwise() const override { return true; } + std::string GetElementwiseExpr(const std::string& v) const override { + return "static_cast(1) / (static_cast(1) + exp(-(" + v + ")))"; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } + + std::vector GetStdLibs() override { return { std::string("cmath") };} +}; + +}//SOFIE + +#endif //SOFIE_ROPERATOR_Sigmoid diff --git a/core/inc/SOFIE/ROperator_Slice.hxx b/core/inc/SOFIE/ROperator_Slice.hxx new file mode 100644 index 0000000..fb738cf --- /dev/null +++ b/core/inc/SOFIE/ROperator_Slice.hxx @@ -0,0 +1,592 @@ +#ifndef SOFIE_ROPERATOR_SLICE +#define 
SOFIE_ROPERATOR_SLICE + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include + + +namespace SOFIE{ + +// slice operator + +template +class ROperator_Slice final : public ROperator +{ + +private: + + // flags to indicate if start/end and steps are not defined at compiled time + bool fIsStartUndef = false; + bool fIsEndUndef = false; + bool fIsStepUndef = false; + bool fIdentitySlice = false; + std::string fNData; // input data tensor name + std::string fNOutput; // output data name + std::vector fNames; // tensor names for meta(axis) information + std::vector fShapeInput; // input shape + std::vector fShapeOutput; // output shape + std::vector fOutputShapeData; // output shape data in case output is a shape param tensor + + // saved Start/End.Steps are corrected from initial ONNX for negative/default values + // and are available for each axis + std::vector fStart; // starting values of slices for all axes + std::vector fEnd; // End values of slices for all axes + std::vector fSteps; // step values of slices for all axes + std::vector fStartDims; // input starting values of slices + std::vector fEndDims; // input End values of slices + std::vector fStepDims; // input step values of slices + std::vector fAxes; // axes for input start/emd/step values + + std::vector> fAttributes; // attributes for the version <=10 case + + +public: + + ROperator_Slice(){} + + // ctor for versions >= 10 + ROperator_Slice(std::string nameData, std::vector names, std::string nameOutput) + : fNData(UTILITY::Clean_name(nameData)), + fNOutput(UTILITY::Clean_name(nameOutput)) + { + fNames.resize(4); + // axes and steps can be optional + for (size_t i = 0; i < names.size(); ++i) { + fNames[i] = UTILITY::Clean_name(names[i]); + } + + fInputTensorNames = { fNData }; + fOutputTensorNames = { fNOutput }; + } + // ctor for versions < 10 + ROperator_Slice(std::string nameData, std::vector starts, std::vector ends, 
std::vector axes, std::string nameOutput) + : fNData(UTILITY::Clean_name(nameData)), + fNOutput(UTILITY::Clean_name(nameOutput)) + { + fAttributes.push_back(starts); + fAttributes.push_back(ends); + fAttributes.push_back(axes); + /* register the input/output tensor names exactly as the ctor for versions >= 10 does, so both construction paths expose the same operator metadata */ + fInputTensorNames = { fNData }; + fOutputTensorNames = { fNOutput }; + } + + + + void Initialize(RModel& model) override { + if (model.CheckIfTensorAlreadyExist(fNData) == false){ //input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("TMVA Slice Op Input Tensor is not found in model"); + } + + std::vector> shapes; + fShapeInput = model.GetDimTensorShape(fNData); + shapes.push_back(fShapeInput); + + std::vector> itensors(4); + + if (fNames.size() > 0) { // size has to be equal to 4 + // loop on the extra 2 or 3 or 4 inputs + for (size_t i = 0; i < 4; ++i) { + if (!fNames[i].empty()) { + if (model.IsInitializedTensor(fNames[i])) { + auto dptr = model.GetInitializedTensorData(fNames[i]); + auto tensor = static_cast(dptr.get()); + auto vec = model.GetTensorShape(fNames[i]); + assert(vec.size() == 1); + itensors[i] = std::vector(tensor, tensor + vec[0]); + + } else if (model.IsShapeTensor(fNames[i])) { + // case is a shape tensor + if (i == 0) { + fStartDims = model.GetShapeTensorValues(fNames[i]); + } else if (i == 1) { + fEndDims = model.GetShapeTensorValues(fNames[i]); + } else if (i == 3) { + fStepDims = model.GetShapeTensorValues(fNames[i]); + } + } else { + // case is an intermediate tensor + auto shape = model.GetTensorShape(fNames[i]); + size_t s = shape[0]; + for (size_t k = 0; k < s; k++) { + if (i == 0) { + fStartDims.push_back( Dim{std::string("start_") + fNOutput + "_" + std::to_string(k)}); + fIsStartUndef = true; + } else if (i == 1) { + fEndDims.push_back(Dim{std::string("end_") + fNOutput + "_" + std::to_string(k)}); + fIsEndUndef = true; + } else if (i == 3) { + fStepDims.push_back(Dim{std::string("step_") + fNOutput + "_" + std::to_string(k)}); + fIsStepUndef = true; + } + } + } + } + } + } else { + // old slice versions + assert(fAttributes.size() 
> 1); + for (size_t i = 0; i < fAttributes.size(); i++) { + itensors[i] = fAttributes[i]; + } + } + size_t dim = fShapeInput.size(); + + // default values + fSteps = std::vector(dim, Dim{1}); + fStart = std::vector(dim, Dim{0}); + fEnd = fShapeInput; + + // default axes + if (itensors[2].empty()) { + fAxes.resize(dim); + std::iota(fAxes.begin(), fAxes.end(), 0); + } else { + fAxes = itensors[2]; + for (size_t i = 0; i < fAxes.size(); i++) { + // negative axes - they count from the back + if (fAxes[i] < 0) fAxes[i] = dim + fAxes[i]; + if (fAxes[i] < 0 || fAxes[i] >= static_cast(dim)) + throw std::runtime_error("TMVA Slice Op : invalid axis value " + std::to_string(fAxes[i]) + + " for " + std::to_string(i)); + } + } + // Loop on axis to get start/end/step values + for (size_t i = 0; i < fAxes.size(); i++) { + if (!itensors[0].empty() ) + fStartDims.push_back(Dim{ static_cast(itensors[0][i])}); + if (fStartDims.empty()) + throw std::runtime_error("TMVA Slice Op : Missing start input tensor"); + + if (!itensors[1].empty()) + fEndDims.push_back(Dim{ static_cast(itensors[1][i])}); + else if (fEndDims.empty()) + throw std::runtime_error("TMVA Slice Op : Missing end input tensor"); + + if (!itensors[3].empty()) { + fStepDims.push_back(Dim{ static_cast(itensors[3][i])}); + } + else if (fStepDims.size() < fAxes.size()) // this can happen since it is optional + fStepDims.push_back(Dim{size_t(1)}); + + if (!fShapeInput[fAxes[i]].isParam) { + size_t iAxisDim = fShapeInput[fAxes[i]].dim; + //correct values if too large or too small + IType istart = 0; + if (!fStartDims[i].isParam) { + istart = static_cast(fStartDims[i].dim); + if (istart < 0) istart = iAxisDim + istart; + } + IType iend = static_cast(iAxisDim); + if (!fEndDims[i].isParam) { + iend = static_cast(fEndDims[i].dim); + if (iend < 0) iend = iAxisDim + iend; + } + //steps + IType istep = 1; + if (!fStepDims[i].isParam) { + istep = static_cast(fStepDims[i].dim); + } else { + throw std::runtime_error("TMVA Slice Op : 
parametric step inputs are not supported"); + } + // clamp start end values depending on steps + // start must be [0,N] for positive steps or [0,N-1] for negative + // end must be [0,N] for positive steps or [-1, N-1] for negative + if (istart < 0) istart = 0; + if (istep > 0) { + if (istart > static_cast(iAxisDim)) istart = static_cast(iAxisDim); + if (iend < 0) iend = 0; + if (iend > static_cast(iAxisDim)) iend = static_cast(iAxisDim); + } else if (istep < 0) { + if (istart > static_cast(iAxisDim)-1) istart = static_cast(iAxisDim) -1; + if (iend < -1) iend = -1; + if (iend > static_cast(iAxisDim)-1) iend = static_cast(iAxisDim) -1; + } else { + throw std::runtime_error("TMVA Slice Op : invalid step value " + std::to_string(istep) + + " for " + std::to_string(i)); + } + // for parametric values clamping we will done at run time + if (fStartDims[i].isParam) + fStart[fAxes[i]] = fStartDims[i]; + else + fStart[fAxes[i]] = Dim{size_t(istart)}; + /* the end bound is selected by its OWN isParam flag: testing the start flag here either dropped a parametric end or stored an unclamped constant end */ + if (fEndDims[i].isParam) + fEnd[fAxes[i]] = fEndDims[i]; + else + fEnd[fAxes[i]] = Dim{size_t(iend)}; + + fSteps[fAxes[i]] = Dim{size_t(istep)}; + } else { + //std::cout << i << " Param dim for " << fAxes[i] << " " << fShapeInput[fAxes[i]] << std::endl; + // correct only negative values + if (!fStartDims[i].isParam) { + IType istart = static_cast(fStartDims[i].dim); + if (istart < 0) { + std::string sstart = std::string("(") + fShapeInput[fAxes[i]].param + "-" + std::to_string(-istart) +")"; + fStart[fAxes[i]] = Dim{sstart,size_t(-1)}; + } else { + fStart[fAxes[i]] = Dim{size_t(istart)}; + } + } else { + fStart[fAxes[i]] = fStartDims[i]; + } + if (!fEndDims[i].isParam) { + IType iend = static_cast(fEndDims[i].dim); + if (iend < 0) { + std::string send = std::string("(") + fShapeInput[fAxes[i]].param + "-" + std::to_string(-iend) +")"; + fEnd[fAxes[i]] = Dim{send,size_t(-1)}; + } else if (iend == std::numeric_limits::max()){ + fEnd[fAxes[i]] = fShapeInput[fAxes[i]]; + } else { + fEnd[fAxes[i]] = Dim{size_t(iend)}; + } + } 
else { + fEnd[fAxes[i]] = fEndDims[i]; + } + + fSteps[fAxes[i]] = fStepDims[i]; + } + + } + // find output shape + fShapeOutput.resize(dim); + for (size_t i = 0; i < dim; i++) { + if (!fEnd[i].isParam && !fStart[i].isParam && !fSteps[i].isParam) { + int64_t istart = static_cast(fStart[i].dim); + int64_t iend = static_cast(fEnd[i].dim); + int64_t istep= static_cast(fSteps[i].dim); + /* a zero step can still reach this point when the sliced axis itself is parametric; reject it instead of dividing by zero */ + if (istep == 0) throw std::runtime_error("TMVA Slice Op : invalid step value 0 for " + std::to_string(i)); + /* number of selected elements is ceil((end-start)/step) and never negative; truncating division under-counted e.g. start=0,end=5,step=2 (3 elements, not 2) */ + int64_t s = (istep > 0) ? (iend - istart + istep - 1) / istep : (iend - istart + istep + 1) / istep; + if (s < 0) s = 0; + fShapeOutput[i] = Dim{static_cast(s)}; + } else { + std::string s; + if (fStart[i].GetVal() != "0") + s = "(" + fEnd[i].GetVal() + "-" + fStart[i].GetVal() + ")"; + else + s = fEnd[i].GetVal(); + if (fSteps[i].GetVal() != "1") { + /* emit ((range)/step): two opening parentheses are needed to balance the two closing ones appended below */ + s.insert(0,"(("); + s += ")/" + fSteps[i].GetVal() + ")"; + } + fShapeOutput[i] = Dim{s,size_t(-1)}; + // add also the shape parameters to RModel to declare them when + // allocating output tensor + if (fEnd[i].isParam && fEnd[i].dim != size_t(-1)) + model.AddShapeParam(fEnd[i].param,fEnd[i].dim ); + if (fStart[i].isParam && fStart[i].dim != size_t(-1)) + model.AddShapeParam(fStart[i].param,fStart[i].dim ); + if (fSteps[i].isParam && fSteps[i].dim != size_t(-1)) + model.AddShapeParam(fSteps[i].param,fSteps[i].dim ); + + } + } + // case input is a constant tensor and of int64 type + if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) { + fIsOutputConstant = true; + auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); + size_t outputSize = ConvertShapeToLength(ConvertShapeToInt(fShapeOutput)); + std::vector outputData(outputSize); + std::vector inputStride = UTILITY::ComputeStrideFromShape(ConvertShapeToInt(fShapeInput)); + if (model.Verbose()) { + std::cout << "Do slice for initialized input ..(start, end, step)\n"; + for (size_t ii = 0; ii< fStart.size(); ii++) + std::cout << fStart [ii] << " " << fEnd[ii] << " " << fSteps[ii] << std::endl; + } + // perform slice using a recursive function- need to use two lambda functions for this + auto sliceRecursive = [&](size_t 
iaxis, size_t & outIdx, size_t & inOffset) { + auto slice_impl = [&](size_t iax, size_t & outputIdx, size_t & inputOffset, auto & sliceRecImpl) { + if (fStart[iax].isParam || fEnd[iax].isParam || fSteps[iax].isParam) + throw std::runtime_error("TMVA Slice Op : cannot have parametric values when input is constant"); + // compute indices + std::vector indices; + for (IType i = (IType) fStart[iax].dim; (IType(fSteps[iax].dim) > 0) ? i < IType(fEnd[iax].dim) : i > IType(fEnd[iax].dim); i += IType(fSteps[iax].dim) ) + indices.push_back(i); + if (iax == dim-1) { // last axis + for (size_t i = 0; i < indices.size(); i++) { + outputData[outputIdx] = inputData[inputOffset + indices[i]]; + outputIdx++; + } + return; + } else { + for (size_t i = 0; i < indices.size(); i++) { + size_t offset = inputOffset + inputStride[iax]*indices[i]; + sliceRecImpl(iax+1, outputIdx, offset,sliceRecImpl); + } + } + }; + slice_impl(iaxis, outIdx, inOffset,slice_impl); + }; + size_t idx = 0; + size_t offset = 0; + sliceRecursive(0, idx, offset); + + model.AddConstantTensor(fNOutput, ConvertShapeToInt(fShapeOutput), outputData.data()); + if (model.Verbose()) { + std::cout << "Slice: output is a constant tensor " << ConvertDimShapeToString(fShapeOutput) << " : " + << ConvertValuesToString(outputData) << std::endl; + } + } + else if (model.IsShapeTensor(fNData) && !fStart[0].isParam && !fEnd[0].isParam) { + // case of input is a shape tensor. 
In this case rank=1 always, axis =0 and Slice is trivial + auto inputData = model.GetShapeTensorValues(fNData); + fOutputShapeData = std::vector(inputData.begin() + fStart[0].dim, inputData.begin() + fEnd[0].dim); + // try to convert to integer values if possible + auto outputData = ConvertShapeToInt(fOutputShapeData); + fShapeOutput = { Dim{fOutputShapeData.size()}}; + if (outputData.empty()) { + // is a param shape tensor + model.AddShapeTensor(fNOutput, fOutputShapeData); + fIsOutputParamShape = true; + if (model.Verbose()) { + std::cout << "Slice: output is a shape tensor -> " << fNOutput << " " << ConvertDimShapeToString(fShapeOutput) << " with values " + << ConvertDimShapeToString(fOutputShapeData) << " (shape)" << std::endl; + } + } else { + fIsOutputConstant = true; + std::vector data(outputData.size()); + std::copy(outputData.begin(), outputData.end(), data.begin()); + model.AddConstantTensor(fNOutput, {data.size()}, data.data()); + if (model.Verbose()) { + std::cout << "Slice: output is a constant tensor -> " << fNOutput << " " << ConvertDimShapeToString(fShapeOutput) << " with values " + << ConvertDimShapeToString(fOutputShapeData) << " constant " << std::endl; + } + } + } + else { + // check if Slice is just an Identity operator in case start = 0, end = input_shape and step=1 + size_t ndim = fShapeInput.size(); + fIdentitySlice = fShapeOutput.size() == ndim; + // check also if input data is not input to the model. 
In that case we copy the data since we cannot just copy from the input pointer + fIdentitySlice &= (!model.IsReadyInputTensor(fNData) && !model.IsDimInputTensor(fNData)); + for (size_t idim = 0; idim < ndim; idim++) { + if (!fIdentitySlice) break; + fIdentitySlice &= (fStart[idim].GetVal() == "0"); + fIdentitySlice &= (fSteps[idim].GetVal() == "1"); + fIdentitySlice &= (fEnd[idim].GetVal() == fShapeInput[idim].GetVal()); + } + + model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); + //if (fIdentitySlice) model.AddAliasTensor(fNOutput, fNData); + + if (model.Verbose()) { + std::cout << "Slice " << fNData << " " << ConvertDimShapeToString(fShapeInput) + << "---> " << fNOutput << " " << ConvertDimShapeToString(fShapeOutput); + if (fIdentitySlice) std::cout << " (using alias tensor since slice is an identity) "; + std::cout << std::endl; + + } + } + } + + std::string Generate(std::string opName) override { + + if (fShapeInput.empty() || fShapeOutput.empty()){ + throw std::runtime_error("SOFIE Slice Op called to Generate without being initialized first"); + } + + std::stringstream out; + + out << "///------- Slice operator " << opName << "---> " << fNOutput << " " + << ConvertDimShapeToString(fShapeOutput) << "\n" << std::endl; + if (fIsOutputConstant) return out.str(); //no op for constant tensors + if (fIsOutputParamShape) { + out << "/// Slice output is a shape tensor with values : " << ConvertDimShapeToString(fShapeOutput) << "\n"; + // need to generate code assigning values to shape tensors + for (int i = 0; i < static_cast(fShapeOutput[0].dim); i++) { + out << SP << "tensor_" << fNOutput << "[" << i << "] = " << fOutputShapeData[i] << ";\n"; + } + return out.str(); + } + + size_t ndim = fShapeInput.size(); + + if (fIdentitySlice) { + out << "/// Slice is just an identity (copy) \n"; + //out << SP << "tensor_" << fNOutput << " = const_cast<" << ConvertTypeToString(fOutputType) << " *>(tensor_" << fNData << ");\n"; + out << SP << 
"std::copy(tensor_" << fNData << ", tensor_" << fNData << " + " << ConvertDimShapeToLength(fShapeInput) << ", tensor_" << fNOutput << ");\n"; + return out.str(); + } + + // loop on the dimensions depending no the orders + auto strides = UTILITY::ComputeStrideFromShape(fShapeInput); + + + out << SP << "{\n"; // define operator scope + for (size_t i = 0; i < fStepDims.size(); i++) { + if (fStepDims[i].isParam) { + if (fIsStepUndef) + out << SP << "size_t " << fStepDims[i] << " = tensor_" << fNames[3] << "[" << i << "];\n"; + } + } + // special case for parametric values for start/end. Need to do clipping + for (size_t i = 0; i < fStartDims.size(); i++) { + if (fStartDims[i].isParam && fStartDims[i].param != fShapeInput[fAxes[i]].param) { + std::string s_start = "start_" + std::to_string(i); + if (fIsStartUndef) { + s_start = fStartDims[i].param; + out << SP << "size_t " << s_start << " = tensor_" << fNames[0] << "[" << i << "];\n"; + } else { + out << SP << "size_t " << s_start << " = " << fStartDims[i] << ";\n"; + fStart[fAxes[i]] = s_start; // need to use this value later when slicing + } + out << SP << "if (" << s_start << " < 0) " << s_start << " += " << fShapeInput[fAxes[i]] <<";\n"; + out << SP << "if (" << s_start << " < 0) " << s_start << " = 0;\n"; + if (!fStepDims[i].isParam) { + if (static_cast(fStepDims[i].dim) > 0 ) + out << SP << "if (" << s_start << " > " << fShapeInput[fAxes[i]] << " ) " << s_start << " = " << fShapeInput[fAxes[i]] <<";\n"; + else + out << SP << "if (" << s_start << " > " << fShapeInput[fAxes[i]] << " - 1" << " ) " << s_start << " = " << fShapeInput[fAxes[i]] << " - 1;\n"; + } + } + // special case if step is negative and shape are equal and step is negative + else if (fStartDims[i].isParam && fStartDims[i].param == fShapeInput[fAxes[i]].param && !fStepDims[i].isParam && static_cast(fStepDims[i].dim) < 0 ) { + fStart[fAxes[i]] = Dim{ fStartDims[i].param + "-1" }; + } + } + // now to for end + for (size_t i = 0; i < fEndDims.size(); 
i++) { + if (fEndDims[i].isParam && fEndDims[i].param != fShapeInput[fAxes[i]].param) { + std::string s_end = "end_" + std::to_string(i); + if (fIsEndUndef) { + s_end = fEndDims[i].param; + out << SP << "size_t " << s_end << " = tensor_" << fNames[1] << "[" << i << "];\n"; + } else { + out << SP << "size_t " << s_end << " = " << fEndDims[i] << ";\n"; + fEnd[fAxes[i]] = s_end; // need to use this value later when slicing + } + out << SP << "if (" << s_end << " < 0) " << s_end << " += " << fShapeInput[fAxes[i]] <<";\n"; + if (!fStepDims[i].isParam) { + if (static_cast(fStepDims[i].dim) > 0 ) { + out << SP << "if (" << s_end << " < 0) " << s_end << " = 0;\n"; + out << SP << "if (" << s_end << " > " << fShapeInput[fAxes[i]] << " ) " << s_end << " = " << fShapeInput[fAxes[i]] <<";\n"; + } else { + out << SP << "if (" << s_end << " < -1) " << s_end << " = -1;\n"; + out << SP << "if (" << s_end << " > " << fShapeInput[fAxes[i]] << " - 1" << " ) " << s_end << " = " << fShapeInput[fAxes[i]] << " - 1;\n"; + } + } + } + // special case if step is negative and shape are equal and step is negative + else if (fEndDims[i].isParam && fEndDims[i].param == fShapeInput[fAxes[i]].param && !fStepDims[i].isParam && static_cast(fStepDims[i].dim) < 0 ) { + fEnd[fAxes[i]] = Dim{ fEndDims[i].param + "-1" }; + } + } + + out << SP << "size_t iOut = 0;\n"; + std::string MSP = SP; + for (size_t idim = 0; idim < ndim; idim++) { + out << MSP << "for (size_t i" << idim << " = " << fStart[idim] << "; i" << idim << " < " << fEnd[idim] + << "; i" << idim << "+= " << fSteps[idim] << ") {\n"; + MSP += SP; + if (idim < ndim-1) out << MSP << "size_t stride" << idim << " = " << strides[idim] << "*i" << idim << ";\n"; + } + out << MSP << "size_t iInput = "; + for (size_t idim = 0; idim < ndim-1; idim++) out << " stride" << idim << " + "; + // here should be step size ? 
+ out << "i" << ndim-1 << ";\n"; + out << MSP << "tensor_" << fNOutput << "[iOut++] = tensor_" <\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + outputStrides[d].GetVal() + "u) % " + + fShapeOutput[d].GetVal() + "u;\n"; + } + op += "\n"; + + // Map each output coord back to input coord: + // input_coord[d] = fStart[d] + out_d * fSteps[d] + // Negative steps are supported naturally since fStart/fEnd/fSteps are + // already corrected for negative/default values during Initialize(). + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + // input coordinate for this dim: start + out_d * step + std::string input_coord = "(" + fStart[d].GetVal() + + " + out_" + std::to_string(d) + + " * " + fSteps[d].GetVal() + ")"; + op += SP + SP + SP + SP + SP + + "static_cast(" + input_coord + ")" + + " * " + inputStrides[d].GetVal() + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + std::string kname = "SliceKernel_" + opName; + return SP + kname + " sliceKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeInput.empty() || fShapeOutput.empty()) + throw std::runtime_error("SOFIE Slice Op called to Generate without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeOutput); + std::string kname = "sliceKernel_" + opName; + + std::stringstream out; + out << "\n//------ SLICE_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNData << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNOutput << ")" + << ", static_cast(" << totalElements << "));\n"; + + return out.str(); + } + +}; + +}//SOFIE + + +#endif //SOFIE_ROPERATOR_SLICE diff --git a/core/inc/SOFIE/ROperator_Softmax.hxx b/core/inc/SOFIE/ROperator_Softmax.hxx new file mode 100644 index 0000000..5626c0f --- /dev/null +++ b/core/inc/SOFIE/ROperator_Softmax.hxx @@ -0,0 +1,192 @@ +#ifndef SOFIE_ROPERATOR_Softmax +#define SOFIE_ROPERATOR_Softmax + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + +namespace SOFIE { + +class ROperator_Softmax final : public ROperator { + 
+private: + /* every scalar member has an in-class initializer so the default constructor cannot leave indeterminate values behind */ + bool fLogSoftmax = false; // for the logsoftmax case + bool fUseVDT = false; + int64_t fAttrAxis = -1; /* -1 selects the last axis (the ONNX Softmax-13 default) */ + + std::string fNX; + std::string fNY; + std::vector fShape; + + std::string fType; + +public: + ROperator_Softmax() {} + ROperator_Softmax(int64_t attr_axis, std::string nameX, std::string nameY, bool logSoftmax = false) + : fLogSoftmax(logSoftmax), + fAttrAxis(attr_axis), fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) + + { + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { return input; } + + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; // suggest copy to compiler + return ret; + } + + void Initialize(RModel& model) override { + if (model.CheckIfTensorAlreadyExist(fNX) == + false) { // input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("SOFIE Softmax Op Input Tensor is not found in model"); + } + fShape = model.GetDimTensorShape(fNX); + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); + fType = ConvertTypeToString(model.GetTensorType(fNX)); + if (model.Verbose()) { + std::cout << "Softmax -> " << fNY << " " << ConvertDimShapeToString(fShape) << std::endl; + } + fUseVDT = model.UseVDT(); + if (fUseVDT) { + model.AddNeededCustomHeader("vdt/exp.h"); + if (fLogSoftmax) + model.AddNeededCustomHeader("vdt/log.h"); + } + } + + std::string Generate(std::string opName) override { + opName = "op_" + opName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Operator Softmax called to Generate without being initialized first"); + } + std::stringstream out; + out << "///------- Softmax " << opName << " ---> " // << fNY << " " + << ConvertDimShapeToString(fShape) << "\n" << std::endl; + size_t size = fShape.size(); + auto length_str = ConvertDimShapeToLength(fShape); + size_t axis = fAttrAxis < 0 ? size + fAttrAxis : fAttrAxis; + /* an axis outside [-rank, rank) wraps to a huge index above; fail here instead of indexing fShape out of bounds */ + if (axis >= size) throw std::runtime_error("SOFIE Softmax Op : invalid axis value " + std::to_string(fAttrAxis)); + + std::string expFunction = (fUseVDT) ? 
"vdt::fast_expf" : "std::exp"; + std::string logFunction = (fUseVDT) ? "vdt::fast_logf" : "std::log"; + + // Check if this is the special case where memory is contiguous. + if (axis == size - 1) { + std::string axis_size = fShape[axis].GetVal(); + std::string num_rows; + if (IsInteger(length_str) && IsInteger(axis_size)) { + num_rows = std::to_string(std::stoul(length_str) / std::stoul(axis_size)); + } else { + num_rows = "(" + length_str + ") / (" + axis_size + ")"; + } + + out << SP << "//----- softmax axis is last one - " << axis << "\n"; + out << SP << "for (int i = 0; i < " << num_rows << "; ++i) {\n"; + out << SP << SP << "size_t offset = i * " << axis_size << ";\n"; + out << SP << SP << fType << " const * x_ptr = &tensor_" << fNX << "[offset];\n"; + out << SP << SP << fType << " * y_ptr = &tensor_" << fNY << "[offset];\n"; + + out << SP << SP << fType << " vmax = x_ptr[0];\n"; + out << SP << SP << "for (int j = 1; j < " << axis_size << "; ++j) {\n"; + out << SP << SP << SP << "if (x_ptr[j] > vmax) vmax = x_ptr[j];\n"; + out << SP << SP << "}\n"; + + out << SP << SP << fType << " sum = 0.0;\n"; + out << SP << SP << "for (int j = 0; j < " << axis_size << "; ++j) {\n"; + out << SP << SP << SP << "y_ptr[j] = " << expFunction << "(x_ptr[j] - vmax);\n"; + out << SP << SP << SP << "sum += y_ptr[j];\n"; + out << SP << SP << "}\n"; + + out << SP << SP << fType << " inv_sum = 1.0f / sum;\n"; + out << SP << SP << "for (int j = 0; j < " << axis_size << "; ++j) {\n"; + out << SP << SP << SP << "y_ptr[j] *= inv_sum;\n"; + if (fLogSoftmax) + out << SP << SP << SP << "y_ptr[j] = " << logFunction << "(y_ptr[j]);\n"; + out << SP << SP << "}\n"; + out << SP << "}\n"; + + } else { + // generic case for any axis + auto stride = UTILITY::ComputeStrideFromShape(fShape); + size_t k = 0; + std::vector l(size); + for (size_t i = 0; i < size; i++) { + if (i != axis) { + for (size_t j = 0; j < k; j++) out << SP; + l[i] = std::string("i") + std::to_string(i); + out << SP << "for (int " 
<< l[i] << " = 0; " << l[i] << " < " << fShape[i] << "; " << l[i] << "++) {\n"; + k++; + } + } + for (size_t j = 0; j < size-1; j++) out << SP; + out << fType << " sum = 0.;\n"; + for (size_t j = 0; j < size-1; j++) out << SP; + out << "size_t index = "; + bool first = true; + for (size_t i = 0; i < size; i++) { + if (i == axis) continue; + if (!first) out << " + "; + if (stride[i].GetVal() != "1") + out << stride[i] << "*"; + out << l[i]; + first = false; + } + out << ";\n"; + // find maximum looping along reduced axis + for (size_t j = 0; j < size-1; j++) out << SP; + out << fType << " vmax = tensor_" << fNX << "[index];\n"; + for (size_t j = 0; j < size-1; j++) out << SP; + out << "for (int i = 1; i < " << fShape[axis] << "; i++) {\n"; + for (size_t j = 0; j < size; j++) out << SP; + out << fType << " x = tensor_" << fNX << "[index + i"; + if (stride[axis].GetVal() != "1") out << "*(" << stride[axis] << ")"; + out << "];\n"; + for (size_t j = 0; j < size; j++) out << SP; + out << "if (x > vmax) vmax = x;\n"; + for (size_t j = 0; j < size-1; j++) out << SP; + out << "}\n"; + // compute softmax + for (size_t j = 0; j < size-1; j++) out << SP; + out << "for (int i = 0; i < " << fShape[axis] << "; i++) {\n"; + for (size_t j = 0; j < size; j++) out << SP; + out << "size_t id = index + i"; + if (stride[axis].GetVal() != "1") out << "*(" << stride[axis] << ")"; + out << ";\n"; + for (size_t j = 0; j < size; j++) out << SP; + out << "tensor_" << fNY << "[id] = " << expFunction << "(tensor_" << fNX << "[id] - vmax);\n"; + for (size_t j = 0; j < size; j++) out << SP; + out << "sum += tensor_" << fNY << "[id];\n"; + for (size_t j = 0; j < size-1; j++) out << SP; + out << "}\n"; + // normalize + for (size_t j = 0; j < size-1; j++) out << SP; + out << "for (int i = 0; i < " << fShape[axis] << "; i++) {\n"; + for (size_t j = 0; j < size; j++) out << SP; + out << "size_t id = index + i"; + if (stride[axis].GetVal() != "1") out << "*(" << stride[axis] << ")"; + out << ";\n"; + 
for (size_t j = 0; j < size; j++) out << SP; + out << "tensor_" << fNY << "[id] /= sum;\n"; + if (fLogSoftmax) { + for (size_t j = 0; j < size; j++) out << SP; + out << "tensor_" << fNY << "[id] = " << logFunction << "(tensor_" << fNY << "[id]);\n"; + } + for (size_t j = 0; j < size-1; j++) out << SP; + out << "}\n"; + //end loops + for (int i = static_cast(k) - 1; i >= 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } + } + return out.str(); + } +}; + +} // namespace SOFIE + +#endif // SOFIE_ROPERATOR_Softmax diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx b/core/inc/SOFIE/ROperator_Split.hxx similarity index 51% rename from src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx rename to core/inc/SOFIE/ROperator_Split.hxx index 63fbcb3..9604ca8 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx +++ b/core/inc/SOFIE/ROperator_Split.hxx @@ -51,14 +51,14 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Split Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE Split Op Input Tensor is not found in model"); } fInputShape = model.GetTensorShape(fNX); // correct for negative axis if (fAxis < 0) fAxis += fInputShape.size(); if (fAxis < 0 || fAxis >= static_cast(fInputShape.size()) ) - throw std::runtime_error("TMVA SOFIE Split - invalid axis " + std::to_string(fAxis)); + throw std::runtime_error("SOFIE Split - invalid axis " + std::to_string(fAxis)); // compute output shapes size_t nsplit = fNYs.size(); @@ -77,10 +77,10 @@ public: } else { // get split tensor values if (!model.IsInitializedTensor(fNSplit)) - throw std::runtime_error("TMVA SOFIE Split - non-initialized split tensors are not supported"); + throw std::runtime_error("SOFIE Split - non-initialized split tensors are not supported"); auto splitShape = model.GetTensorShape(fNSplit); if 
(splitShape.size() != 1 || splitShape[0] != nsplit) - throw std::runtime_error("TMVA SOFIE Split - split input tensor has invalid shape"); + throw std::runtime_error("SOFIE Split - split input tensor has invalid shape"); auto split_data = static_cast(model.GetInitializedTensorData(fNSplit).get()); fSplit = std::vector(split_data, split_data + nsplit); } @@ -94,7 +94,7 @@ public: fOutputShapes.push_back(outputShape); } if (tot_split != fInputShape[fAxis]) - throw std::runtime_error("TMVA SOFIE Split - Sum of split sizes must match the input dimension along the axis"); + throw std::runtime_error("SOFIE Split - Sum of split sizes must match the input dimension along the axis"); if (model.Verbose()) { @@ -109,7 +109,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fOutputShapes.empty()){ - throw std::runtime_error("TMVA SOFIE Operator Split called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator Split called to Generate without being initialized first"); } auto input_strides = UTILITY::ComputeStrideFromShape(fInputShape); @@ -153,6 +153,105 @@ public: return out.str(); } +std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fOutputShapes.empty()) + throw std::runtime_error("SOFIE Operator Split called to Generate without being initialized first"); + + const std::size_t D = fInputShape.size(); + const std::size_t Nin = fNYs.size(); + + auto inputStrides = UTILITY::ComputeStrideFromShape(fInputShape); + + std::string op; + op = "\n//------ SPLIT_KERNEL_ALPAKA\n"; + std::cout<<"Generating GPU kernel for Split operator with input shape "<< ConvertShapeToString(fInputShape) << " and output shapes : "; + for (std::size_t i = 0; i < Nin; ++i) { + std::cout<<"Loop running for output "<\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* input,\n"; + op += SP + 
SP + SP + "T* output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(outputStrides[d]) + "u) % " + + std::to_string(fOutputShapes[i][d]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string coord = (d == static_cast(fAxis)) + ? ("(out_" + std::to_string(d) + " + " + std::to_string(axis_offset) + "u)") + : ("out_" + std::to_string(d)); + op += SP + SP + SP + SP + SP + coord + " * " + std::to_string(inputStrides[d]) + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n\n"; + } + std::cout<<"Finished generating GPU kernel for Split operator "<(1));\n"; + out << SP << SP << "auto const elementsPerGrid_" << i << " = Vec::all(Idx{" << length << "});\n"; + out << SP << SP << "auto const workDiv_" << i << " = sofie_workdiv(elementsPerGrid_" << i << ");\n"; + out << SP << SP << "auto task_" << opName << "_" << i << " = alpaka::createTaskKernel(workDiv_" << i + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNYs[i] << ")" + << ", static_cast(" << length << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << opName << "_" << i << ");\n"; + out << SP << "}\n"; + } + return out.str(); +} + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_SubGraph.hxx b/core/inc/SOFIE/ROperator_SubGraph.hxx similarity index 95% rename from src/SOFIE_core/inc/SOFIE/ROperator_SubGraph.hxx rename to core/inc/SOFIE/ROperator_SubGraph.hxx index cb17671..572dd43 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_SubGraph.hxx +++ b/core/inc/SOFIE/ROperator_SubGraph.hxx @@ -50,7 +50,7 @@ public: void Initialize(RModel& model) override { //input must be a graph input, or already initialized intermediate tensor if (model.CheckIfTensorAlreadyExist(fNX) == false){ - throw std::runtime_error("TMVA SOFIE If Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE If Op Input Tensor is not found in model"); } //add the subgraph model to parent RModel and initialize them model.InitializeSubGraph(fModel_then); @@ -71,7 +71,7 @@ public: fType = type; else { if (type != fType) - throw std::runtime_error("TMVA SOFIE If Op supports only all outputs of the same type"); + throw std::runtime_error("SOFIE If Op supports only all outputs of the same type"); } model.AddIntermediateTensor(fNYs[i], fType, shape ); } diff 
--git a/src/SOFIE_core/inc/SOFIE/ROperator_Swish.hxx b/core/inc/SOFIE/ROperator_Swish.hxx similarity index 89% rename from src/SOFIE_core/inc/SOFIE/ROperator_Swish.hxx rename to core/inc/SOFIE/ROperator_Swish.hxx index a2552f1..e1dc974 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Swish.hxx +++ b/core/inc/SOFIE/ROperator_Swish.hxx @@ -38,7 +38,7 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Swish Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE Swish Op Input Tensor is not found in model"); } fShape = model.GetTensorShape(fNX); model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); @@ -48,7 +48,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShape.empty()){ - throw std::runtime_error("TMVA SOFIE Operator Swish called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator Swish called to Generate without being initialized first"); } std::stringstream out; int length = 1; diff --git a/core/inc/SOFIE/ROperator_Tanh.hxx b/core/inc/SOFIE/ROperator_Tanh.hxx new file mode 100644 index 0000000..9408cd5 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Tanh.hxx @@ -0,0 +1,112 @@ +#ifndef SOFIE_ROPERATOR_Tanh +#define SOFIE_ROPERATOR_Tanh + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + + +namespace SOFIE{ + +template +class ROperator_Tanh final : public ROperator +{ + +private: + + std::string fNX; + std::string fNY; + std::vector fShape; + +public: + ROperator_Tanh(){} + ROperator_Tanh(std::string nameX, std::string nameY): + fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::TANH; + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector 
input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; //suggest copy to compiler + return ret; + } + + void Initialize(RModel& model) override { + //input must be a graph input, or already initialized intermediate tensor + if (model.CheckIfTensorAlreadyExist(fNX) == false){ + throw std::runtime_error("SOFIE Tanh Op Input Tensor is not found in model"); + } + fShape = model.GetTensorShape(fNX); + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); + + } + + + std::string Generate(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Tanh operator called to Generate without being initialized first"); + } + std::stringstream out; + size_t length = ConvertShapeToLength(fShape); + out << "\n//------ TANH\n"; + out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; + out << SP << SP << "tensor_" << fNY << "[id] = std::tanh(tensor_" << fNX << "[id]);\n"; + out << SP << "}\n"; + return out.str(); + } + + std::vector GetStdLibs() override { return { std::string("cmath") };} + + bool IsElementwise() const override { return true; } + std::string GetElementwiseExpr(const std::string& v) const override { + return "tanh(" + v + ")"; + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = "\n//------ TANH_KERNEL_ALPAKA\n"; + op += "struct TanhKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n"; + op += SP + SP + "const auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + "if (idx < numElements) { out[idx] = tanh(data[idx]); }\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "TanhKernel tanhKernel;\n"; + } + + std::string 
Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Tanh called to Generate_GPU_ALPAKA without being initialized"); + } + std::stringstream out; + size_t length = ConvertShapeToLength(fShape); + out << "\n//------ TANH_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNX + << ", tanhKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } +}; + +}//SOFIE + + +#endif //SOFIE_ROPERATOR_Tanh diff --git a/core/inc/SOFIE/ROperator_Tile.hxx b/core/inc/SOFIE/ROperator_Tile.hxx new file mode 100644 index 0000000..5a3921e --- /dev/null +++ b/core/inc/SOFIE/ROperator_Tile.hxx @@ -0,0 +1,249 @@ +#ifndef SOFIE_ROPERATOR_Tile +#define SOFIE_ROPERATOR_Tile + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + + +namespace SOFIE{ + +template +class ROperator_Tile final : public ROperator +{ + +private: + + std::string fNRepeats; + std::string fNInput; + std::string fNY; + std::vector fShapeInput; + std::vector fShapeY; + std::vector fRepeats; + +public: + ROperator_Tile(){} + ROperator_Tile(std::string nameRepeat, std::string nameInput, std::string nameY): + fNRepeats(UTILITY::Clean_name(nameRepeat)), + fNInput(UTILITY::Clean_name(nameInput)), + fNY(UTILITY::Clean_name(nameY)) { + fInputTensorNames = { fNRepeats, fNInput }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + std::vector ret = input[0]; + for (size_t i = 0; i < input[1].size(); i++) + ret[i] = ret[i] * input[1][i]; + return {ret}; + } + + void Initialize(RModel& model) override { + if 
(model.CheckIfTensorAlreadyExist(fNInput) == false) + throw std::runtime_error("SOFIE Tile Op Input Tensor is not found in model"); + if (model.CheckIfTensorAlreadyExist(fNRepeats) == false) + throw std::runtime_error("SOFIE Tile Op Repeats Tensor is not found in model"); + + fShapeInput = model.GetTensorShape(fNInput); + + if (!model.IsInitializedTensor(fNRepeats)) + throw std::runtime_error("SOFIE Tile Op: non-initialized repeats input is not supported"); + + auto repptr = model.GetInitializedTensorData(fNRepeats); + auto repeats_data = static_cast(repptr.get()); + if (repeats_data == nullptr) + throw std::runtime_error("SOFIE Tile Op: failed to retrieve repeats tensor data"); + + auto repeats_shape = model.GetTensorShape(fNRepeats); + if (repeats_shape.size() != 1) + throw std::runtime_error("SOFIE Tile Op: repeats tensor must be 1D"); + + size_t num_elements = repeats_shape[0]; + + // Save repeats if known at generation time so the GPU kernel can bake + // fShapeInput[d] directly without needing a runtime repeats pointer. + // fRepeats is left empty if repeats are not initialized (future case), + // which will cause the kernel to use the runtime repeats pointer path. 
+ fRepeats.resize(num_elements); + std::copy(repeats_data, repeats_data + num_elements, fRepeats.begin()); + if (fRepeats.size()){ + model.RemoveInitializedTensor(fNRepeats); + } + fShapeY = ShapeInference({fShapeInput, fRepeats})[0]; + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNInput), fShapeY); + + if (model.Verbose()) + std::cout << "Tile: " << fNInput << " " << ConvertShapeToString(fShapeInput) + << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) + << " given repeats " << ConvertShapeToString(fRepeats) << std::endl; + } + + std::string Generate(std::string OpName) override { + OpName = "op_" + OpName; + if (fShapeInput.empty() || fShapeY.empty()) + throw std::runtime_error("SOFIE Tile Op called to Generate without being initialized first"); + + std::stringstream out; + std::string input = "tensor_" + fNInput; + std::string output = "tensor_" + fNY; + std::string repeats = "tensor_" + fNRepeats; + + out << "///-------- Tile operator\n"; + out << "{\n"; + + out << SP << "const int input_shape[" << fShapeInput.size() << "] = {"; + for (size_t i = 0; i < fShapeInput.size(); ++i) { + if (i > 0) out << ", "; + out << fShapeInput[i]; + } + out << "};\n"; + + out << SP << "int inputLength = " << ConvertShapeToLength(fShapeInput) << ";\n"; + out << SP << "int s = 1;\n"; + + // Read repeats from the tensor at runtime so the generated code remains + // correct even if repeats become a runtime input/intermediate in the future + out << SP << "for (int i = " << fShapeInput.size() - 1 << "; i >= 0; i--) {\n"; + out << SP << SP << "int r = " << repeats << "[i];\n"; + out << SP << SP << "int i_offset = 0, o_offset = 0;\n"; + out << SP << SP << "s = s * input_shape[i];\n"; + out << SP << SP << "if (i == " << fShapeInput.size() - 1 << ") {\n"; + out << SP << SP << SP << "for (int j = 0; j < inputLength / s; j++) {\n"; + out << SP << SP << SP << SP << "for (int k = 0; k < r; k++) {\n"; + out << SP << SP << SP << SP << SP << "std::copy(" << input << " 
+ i_offset, " + << input << " + i_offset + s, " + << output << " + o_offset);\n"; + out << SP << SP << SP << SP << SP << "o_offset += s;\n"; + out << SP << SP << SP << SP << "}\n"; + out << SP << SP << SP << SP << "i_offset += s;\n"; + out << SP << SP << SP << "}\n"; + out << SP << SP << "} else {\n"; + out << SP << SP << SP << "for (int j = inputLength / s - 1; j >= 0; j--) {\n"; + out << SP << SP << SP << SP << "o_offset = j * s * r;\n"; + out << SP << SP << SP << SP << "i_offset = j * s;\n"; + out << SP << SP << SP << SP << "for (int k = 0; k < r; k++) {\n"; + out << SP << SP << SP << SP << SP << "std::copy(" << output << " + i_offset, " + << output << " + i_offset + s, " + << output << " + o_offset);\n"; + out << SP << SP << SP << SP << SP << "o_offset += s;\n"; + out << SP << SP << SP << SP << "}\n"; + out << SP << SP << SP << "}\n"; + out << SP << SP << "}\n"; + out << SP << SP << "s *= r;\n"; + out << SP << SP << "inputLength *= r;\n"; + out << SP << "}\n"; + out << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeInput.empty() || fShapeY.empty()) + throw std::runtime_error("SOFIE Operator Tile called to Generate without being initialized first"); + + const std::size_t D = fShapeInput.size(); + + auto inputStrides = UTILITY::ComputeStrideFromShape(fShapeInput); + auto outputStrides = UTILITY::ComputeStrideFromShape(fShapeY); + std::size_t totalElements = ConvertShapeToLength(fShapeY); + + // If fRepeats is populated, repeats were known at generation time and + // we can bake fShapeInput[d] as literals — no runtime repeats pointer needed. + // If fRepeats is empty (future: runtime repeats), pass repeats as a kernel arg. 
+ bool repeatsKnown = !fRepeats.empty(); + + std::string kname = "TileKernel_" + opName; + + std::string op; + op = "\n//------ TILE_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + if (!repeatsKnown) + op += SP + SP + SP + "int64_t const* __restrict__ repeats,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + // Decompose output linear index — output strides always compile-time + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(outputStrides[d]) + "u) % " + + std::to_string(fShapeY[d]) + "u;\n"; + } + op += "\n"; + + // Input index: fShapeInput[d] is always a compile-time constant since + // it is the input tensor shape, never runtime-variable. + // When repeatsKnown, we bake it directly as a literal. + // When not repeatsKnown (future), we still use fShapeInput[d] as a + // literal for the % — repeats pointer is only needed if fShapeY is dynamic. + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + SP + + "(out_" + std::to_string(d) + " % " + std::to_string(fShapeInput[d]) + "u)" + + " * " + std::to_string(inputStrides[d]) + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string kname = "TileKernel_" + opName; + return SP + kname + " tileKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeInput.empty() || fShapeY.empty()) + throw std::runtime_error("SOFIE Operator Tile called to Generate without being initialized first"); + + bool repeatsKnown = !fRepeats.empty(); + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::string kname = "tileKernel_" + opName; + + // Build argument list once, reused for both getValidWorkDiv and exec + std::string args = + "alpaka::getPtrNative(deviceBuf_" + fNInput + "), " + + "alpaka::getPtrNative(deviceBuf_" + fNY + ")"; + if (!repeatsKnown) + args += ", alpaka::getPtrNative(deviceBuf_" + fNRepeats + ")"; + args += ", static_cast(" + std::to_string(totalElements) + ")"; + + std::stringstream out; + out << "\n//------ TILE_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "auto task_" << opName << " = alpaka::createTaskKernel(workDiv_" << opName + << ", " << kname << ", " << args << ");\n"; + out << SP <<"alpaka::enqueue(queue, task_" << opName << ");\n"; + return out.str(); + } + +}; + +}//SOFIE + +#endif //SOFIE_ROPERATOR_Tile diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_TopK.hxx b/core/inc/SOFIE/ROperator_TopK.hxx similarity index 94% rename from src/SOFIE_core/inc/SOFIE/ROperator_TopK.hxx rename to 
core/inc/SOFIE/ROperator_TopK.hxx index 06d8179..7db1768 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_TopK.hxx +++ b/core/inc/SOFIE/ROperator_TopK.hxx @@ -48,7 +48,7 @@ public: std::vector> ShapeInference(std::vector> input) override { if (input.size() != 2) { - throw std::runtime_error("TMVA SOFIE TopK Op Shape Inference needs exactly 2 input tensors"); + throw std::runtime_error("SOFIE TopK Op Shape Inference needs exactly 2 input tensors"); } auto shape = input[0]; // Shape format: [ m x n x o x p ... ] @@ -62,11 +62,11 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false) { // input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE TopK Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE TopK Op Input Tensor is not found in model"); } if (model.CheckIfTensorAlreadyExist(fNK) == false) { // input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE TopK Op Input Tensor i.e. K is not found in model"); + throw std::runtime_error("SOFIE TopK Op Input Tensor i.e. K is not found in model"); } fShapeX = model.GetTensorShape(fNX); @@ -77,7 +77,7 @@ public: fAttrAxis = fAttrAxis < 0 ? 
fShapeX.size() + fAttrAxis : fAttrAxis; if(static_cast(fAttrAxis) >= fShapeX.size()){ throw - std::runtime_error("TMVA::SOFIE ONNX TopK op axis = "+ std::to_string(fAttrAxis) +" value exeeds size of tensor " +fNX+" of size "+fShapeX.size()+" ."); + std::runtime_error("TMVA::SOFIE ONNX TopK op axis = "+ std::to_string(fAttrAxis) +" value exeeds size of tensor " +fNX+" of size "+std::to_string(fShapeX.size())+" ."); } // fK cannot be larger that axis dimension fK = std::min(fK, fShapeX[fAttrAxis]); @@ -111,7 +111,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShapeX.empty()) { - throw std::runtime_error("TMVA SOFIE Operator TopK called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator TopK called to Generate without being initialized first"); } std::stringstream out; size_t size = fShapeX.size(); diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx b/core/inc/SOFIE/ROperator_Transpose.hxx similarity index 65% rename from src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx rename to core/inc/SOFIE/ROperator_Transpose.hxx index 11c40bb..83508d0 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx +++ b/core/inc/SOFIE/ROperator_Transpose.hxx @@ -46,10 +46,10 @@ public: } std::vector> ShapeInference(std::vector> input) override { - if (input.size() > 1) throw std::runtime_error("TMVA SOFIE Tranpose Op Shape Inference only need 1 input tensor"); + if (input.size() > 1) throw std::runtime_error("SOFIE Tranpose Op Shape Inference only need 1 input tensor"); auto& data = input[0]; if (fAttrPerm.size() != data.size() ) - throw std::runtime_error("TMVA SOFIE Tranpose Op - Invalid axes attributes"); + throw std::runtime_error("SOFIE Tranpose Op - Invalid axes attributes"); std::vector output_shape(fAttrPerm.size()); for (size_t i = 0; i < fAttrPerm.size(); i++){ @@ -64,7 +64,7 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNData) == 
false){ //input must be a graph input, or already initialized intermediate tensor std::cout<<"Input tensor for transpose: "<\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output,"; + op += "const std::size_t totalElements) const {\n"; + op += SP + SP + SP + SP + "auto const idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + SP + "if(idx >= totalElements) return;\n"; + op += SP + SP + SP + SP + "std::size_t input_idx = 0;\n"; + op += SP + SP + SP + SP + "std::size_t remaining = idx;\n"; + op += SP + SP + SP + SP + "std::size_t coord;\n"; + + auto inputStrides = UTILITY::ComputeStrideFromShape(fShapeData); + auto outputStrides = UTILITY::ComputeStrideFromShape(fShapeOutput); + + for (size_t k = 0; k < fShapeData.size(); k++) { + op += SP + SP + SP + SP + "coord = remaining / " + + std::to_string(outputStrides[k]) + "u;\n"; + op += SP + SP + SP + SP + "remaining = remaining - coord * " + + std::to_string(outputStrides[k]) + "u;\n"; + op += SP + SP + SP + SP + "input_idx += coord * " + + std::to_string(inputStrides[fAttrPerm[k]]) + "u;\n"; + } + + op += SP + SP + SP + SP + "output[idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) override { + return SP + "TransposeKernel_op_" + OpName + " transposeKernel_" + OpName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + if (fShapeOutput.empty()) { + throw std::runtime_error("SOFIE Operator Transpose called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertShapeToLength(fShapeOutput); + + out << "\n//------ TRANSPOSE_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNOutput + << ", transposeKernel_" << OpName << ", alpaka::getPtrNative(deviceBuf_" << fNData + << "), 
alpaka::getPtrNative(deviceBuf_" << fNOutput << "), static_cast(" << length << "));\n"; + out << SP <<"alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } }; diff --git a/core/inc/SOFIE/ROperator_Where.hxx b/core/inc/SOFIE/ROperator_Where.hxx new file mode 100644 index 0000000..b9956e9 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Where.hxx @@ -0,0 +1,613 @@ +#ifndef SOFIE_ROperator_Where +#define SOFIE_ROperator_Where + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + +namespace SOFIE{ + +template +class ROperator_Where final : public ROperator{ +private: + + bool fIsInputBoolTensor = false; + + + std::string fNX; + std::string fNY; + std::string fNC; + std::string fNBroadcastedX; + std::string fNBroadcastedY; + std::string fNBroadcastedC; + std::string fNZ; + + + + // static shapes (used when tensors are not dynamic) ) + std::vector fShapeX; + std::vector fShapeY; + std::vector fShapeC; + std::vector fShapeZ; + + // Dynamic generic shapes + std::vector fDimShapeC; + std::vector fDimShapeX; + std::vector fDimShapeY; + std::vector fDimShapeZ; + + // Broadcast flag: mirrors convention of BasicBinary + // bit 0: broadcast Y->X (Y needs expanding) + // bit 1: broadcast X->Y (X needs expanding) + // bit 2: broadcast C->Z (C needs expanding) + // bit 4: shapes may differ at runtime (dynamic) + int fBroadcastFlag = 0; + +public: + ROperator_Where(){} + ROperator_Where(const std::string & nameC, const std::string & nameX, const std::string & nameY, const std::string & nameZ): + fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)), fNC(UTILITY::Clean_name(nameC)), fNZ(UTILITY::Clean_name(nameZ)){ + fInputTensorNames = { fNX, fNY, fNC }; + fOutputTensorNames = { fNZ }; + } + + // type of output given input + std::vector TypeInference(std::vector input) override { + return input; + } + + // shape of output tensors given input tensors + std::vector> ShapeInference(std::vector> 
input) override { + // assume now inputs have same shape (no broadcasting) + auto ret = std::vector>(1, input[0]); // return vector size 1 with first input + return ret; + } + + void Initialize(RModel& model) override { + // input must be a graph input, or already initialized intermediate tensor + if (!model.CheckIfTensorAlreadyExist(fNX)){ + throw std::runtime_error(std::string("SOFIE Where Op Input Tensor ") + fNX + "is not found in model"); + } + if (!model.CheckIfTensorAlreadyExist(fNY)) { + throw std::runtime_error(std::string("SOFIE Where Op Input Tensor ") + fNY + "is not found in model"); + } + if (!model.CheckIfTensorAlreadyExist(fNC)) { + throw std::runtime_error(std::string("SOFIE Where Op Input Tensor ") + fNC + "is not found in model"); + } + // check if fNC input tensor is boolean + if (model.IsReadyInputTensor(fNC)) + fIsInputBoolTensor = true; + + // ---------------------------------------------------------------- // + // Collect shapes – dynamic or static + // ---------------------------------------------------------------- // + int dynamicInputs = 0; // bitmask: bit0=C, bit1=X, bit2=Y + + if (model.IsDynamicTensor(fNC)) { + fDimShapeC = model.GetDynamicTensorShape(fNC); + dynamicInputs |= 1; + } else { + fShapeC = model.GetTensorShape(fNC); + fDimShapeC = ConvertShapeToDim(fShapeC); + } + if (model.IsDynamicTensor(fNX)) { + fDimShapeX = model.GetDynamicTensorShape(fNX); + dynamicInputs |= 2; + } else { + fShapeX = model.GetTensorShape(fNX); + fDimShapeX = ConvertShapeToDim(fShapeX); + } + if (model.IsDynamicTensor(fNY)) { + fDimShapeY = model.GetDynamicTensorShape(fNY); + dynamicInputs |= 4; + } else { + fShapeY = model.GetTensorShape(fNY); + fDimShapeY = ConvertShapeToDim(fShapeY); + } + + + if (model.Verbose()) { + if (dynamicInputs & 1) + std::cout << "Where : condition " << fNC << " is dynamic " << ConvertDimShapeToString(fDimShapeC) << "\n"; + if (dynamicInputs & 2) + std::cout << "Where : " << fNX << " is dynamic " << 
ConvertDimShapeToString(fDimShapeX) << "\n"; + if (dynamicInputs & 4) + std::cout << "Where : Y " << fNZ << " is dynamic " << ConvertDimShapeToString(fDimShapeZ) << "\n"; + } + + // ---------------------------------------------------------------- // + // Static path: all shapes known at code-gen time + // ---------------------------------------------------------------- // + if (dynamicInputs == 0) { + + bool broadcast = !UTILITY::AreSameShape(fShapeX, fShapeY) || !UTILITY::AreSameShape(fShapeX, fShapeC); + if (broadcast) { + // find shape to broadcast between X,Y,C looking for max length + size_t lengthX = ConvertShapeToLength(fShapeX); + size_t lengthY = ConvertShapeToLength(fShapeY); + size_t lengthC = ConvertShapeToLength(fShapeC); + bool broadcastX = false, broadcastY = false, broadcastC = false; + if (lengthX >= lengthY && lengthX >= lengthC) { + fShapeZ = fShapeX; + // broadcast Y and C if different than X + broadcastY = (lengthY != lengthX); + broadcastC = (lengthC != lengthX); + } else if (lengthY >= lengthX && lengthY >= lengthC) { + fShapeZ = fShapeY; + // broadcast X and C if different than Y + broadcastX = (lengthX != lengthY); + broadcastC = (lengthC != lengthY); + } else if (lengthC >= lengthX && lengthC >= lengthY) { + fShapeZ = fShapeC; + // broadcast X and Y if different than C + broadcastX = (lengthX != lengthC); + broadcastY = (lengthY != lengthC); + } + + // Broadcast X to Z + if (broadcastX) { + fNBroadcastedX = "BC_" + fNX + "_to_" + fNZ; + if (model.IsInitializedTensor(fNX)) { + auto data = model.GetInitializedTensorData(fNX); + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX, fShapeZ), + std::default_delete()); + // Update the data and the shape of X + model.AddConstantTensor(fNBroadcastedX, model.GetTensorType(fNX), fShapeZ, broadcastedData); + fShapeX = fShapeZ; + } else { + // I need to prepend to shape of X the extra dimensions added for broadcasting to Z + if (fShapeX.size() < 
fShapeZ.size()) { + size_t nPrepend = fShapeZ.size() - fShapeX.size(); + fShapeX.insert(fShapeX.begin(), nPrepend, 1); + } + } + } + // Broadcast Y to Z + if (broadcastY) { + fNBroadcastedY = "BC_" + fNY + "_to_" + fNZ; + if (model.IsInitializedTensor(fNY)) { + auto data = model.GetInitializedTensorData(fNY); + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeY, fShapeZ), + std::default_delete()); + // do not update tensor B but add broadcasted one (since it can be input to some other operators) + model.AddConstantTensor(fNBroadcastedY, model.GetTensorType(fNY), fShapeZ, broadcastedData); + fShapeY = fShapeZ; + } else { + // I need to prepend to shape of Y the extra dimensions added for broadcasting to Z + if (fShapeY.size() < fShapeZ.size()) { + size_t nPrepend = fShapeZ.size() - fShapeY.size(); + fShapeY.insert(fShapeY.begin(), nPrepend, 1); + } + + } + } + // Broadcast C to Z + if (broadcastC) { + fNBroadcastedC = "BC_" + fNC + "_to_" + fNZ; + if (model.IsInitializedTensor(fNC)) { + auto data = model.GetInitializedTensorData(fNC); + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeC, fShapeZ), + std::default_delete()); + // do not update tensor C but add broadcasted one (since it can be input to some other operators) + model.AddConstantTensor(fNBroadcastedC, model.GetTensorType(fNC), fShapeZ, broadcastedData); + fShapeC = fShapeZ; + } else { + // I need to prepend to shape of C the extra dimensions added for broadcasting to Z + if (fShapeC.size() < fShapeZ.size()) { + size_t nPrepend = fShapeZ.size() - fShapeC.size(); + fShapeC.insert(fShapeC.begin(), nPrepend, 1); + } + } + } + } else { + fShapeZ = fShapeX; + } + // check case of constant output (if all inputs are defined) + if (model.IsInitializedTensor(fNC)) { + std::string nameC = fNBroadcastedC.empty() ? 
fNC : fNBroadcastedC; + auto dataC = static_cast(model.GetInitializedTensorData(nameC).get()); + model.SetNotWritableInitializedTensor(nameC); + T *dataX = nullptr; + T *dataY = nullptr; + std::vector shapeDataX; + std::vector shapeDataY; + if (model.IsInitializedTensor(fNX)) { + std::string nameX = fNBroadcastedX.empty() ? fNX : fNBroadcastedX; + dataX = static_cast(model.GetInitializedTensorData(nameX).get()); + // flag tensors to not be written in a file + model.SetNotWritableInitializedTensor(nameX); + } else if (model.IsShapeTensor(fNX)) { + shapeDataX = model.GetShapeTensorValues(fNX); + } + if (model.IsInitializedTensor(fNY)) { + std::string nameY = fNBroadcastedY.empty() ? fNY : fNBroadcastedY; + dataY = static_cast(model.GetInitializedTensorData(nameY).get()); + model.SetNotWritableInitializedTensor(nameY); + } else if (model.IsShapeTensor(fNY)) { + shapeDataY = model.GetShapeTensorValues(fNY); + } + std::vector dataZ; // used in case output is constant tensor + std::vector shapeDataZ; // used in case output is a shape tensor (can be also constant if all + // dimensions are not parametric) + // if fNC (condition) is initialized we know the output is a shape or a constant tensor, + // so we can compute it at initialization and add it as a constant tensor to the model + // (and not add the operator output as intermediate tensor to the model) + bool isOutputConstantTensor = true; + if (dataX && dataY) { + dataZ.resize(ConvertShapeToLength(fShapeZ)); + for (size_t i = 0; i < dataZ.size(); i++) + dataZ[i] = (dataC[i]) ? dataX[i] : dataY[i]; + if (model.Verbose()) + std::cout << "data A and B : dataZ constant: " << ConvertValuesToString(dataZ) << std::endl; + } else if (dataX && shapeDataY.size() > 0) { + shapeDataZ.resize(ConvertShapeToLength(fShapeZ)); + for (size_t i = 0; i < shapeDataZ.size(); i++) { + shapeDataZ[i] = (dataC[i]) ? 
Dim{size_t(dataX[i])} : shapeDataY[i];
+                  isOutputConstantTensor &= !shapeDataZ[i].isParam;
+               }
+               if (model.Verbose())
+                  std::cout << "data A but shapeB " << ConvertDimShapeToString(shapeDataY) << " "
+                            << isOutputConstantTensor << std::endl;
+            } else if (dataY && shapeDataX.size() > 0) {
+               // X is a shape tensor and Y holds constant data
+               shapeDataZ.resize(ConvertShapeToLength(fShapeZ));
+               for (size_t i = 0; i < shapeDataZ.size(); i++) {
+                  // the selected (true) value comes from X: shapeDataY is empty in this branch
+                  shapeDataZ[i] = (dataC[i]) ? shapeDataX[i] : Dim{size_t(dataY[i])};
+                  isOutputConstantTensor &= !shapeDataZ[i].isParam;
+               }
+               if (model.Verbose())
+                  std::cout << "data B but shapeA " << ConvertDimShapeToString(shapeDataX) << " "
+                            << isOutputConstantTensor << std::endl;
+            } else if (shapeDataY.size() > 0 && shapeDataX.size() > 0) {
+               // both X and Y are shape tensors
+               shapeDataZ.resize(ConvertShapeToLength(fShapeZ));
+               for (size_t i = 0; i < shapeDataZ.size(); i++) {
+                  shapeDataZ[i] = (dataC[i]) ? shapeDataX[i] : shapeDataY[i];
+                  isOutputConstantTensor &= !shapeDataZ[i].isParam;
+               }
+               if (model.Verbose())
+                  std::cout << " shapeA and B " << ConvertDimShapeToString(shapeDataX) << " shapeB "
+                            << ConvertDimShapeToString(shapeDataY) << " " << isOutputConstantTensor << std::endl;
+            }
+            // assume the output was folded; reset below when neither dataZ nor shapeDataZ was filled
+            fIsOutputConstant = true;
+            // add as constant or shape tensor depending on the case
+            if (dataZ.size() > 0)
+               model.AddConstantTensor(fNZ, fShapeZ, dataZ.data());
+            else if (shapeDataZ.size() > 0)
+               model.AddShapeTensor(fNZ, shapeDataZ, fShapeZ.size() == 0);
+            else {
+               fIsOutputConstant = false;
+            }
+            if (fIsOutputConstant && model.Verbose())
+               std::cout << "Where op ---> " << fNZ << " " << ConvertShapeToString(fShapeZ) << " : "
+                         << ((dataZ.size() > 0) ? ConvertValuesToString(dataZ) : ConvertDimShapeToString(shapeDataZ))
+                         << ((dataZ.size() > 0) ?
" (constant)" : " (shape)") << std::endl; + + // output is a constant tensor + if (fIsOutputConstant) + fOutputTensorNames.pop_back(); + } + if (!fIsOutputConstant) { + + fDimShapeZ = ConvertShapeToDim(fShapeZ); + model.AddIntermediateTensor(fNZ, model.GetTensorType(fNX), fShapeZ); + if (model.Verbose()) + std::cout << "Where : condition : " << fNC << " " << ConvertShapeToString(fShapeC) << " X " + << fNX << " " << ConvertShapeToString(fShapeX) << " Y " << fNY << " " + << ConvertShapeToString(fShapeY) << " ---> " << fNZ << " " << ConvertShapeToString(fShapeZ) + << std::endl; + } + } else { + // ---------------------------------------------------------------- // + // Dynamic path: at least one input has a parametric shape + // Need to use BroadcastShape to find output shape + // ---------------------------------------------------------------- // + auto retXY = UTILITY::MultidirectionalBroadcastShape(fDimShapeX, fDimShapeY); + fBroadcastFlag = retXY.first; + fDimShapeZ = retXY.second; + auto retCZ = UTILITY::MultidirectionalBroadcastShape(fDimShapeC, fDimShapeZ); + fBroadcastFlag |= retCZ.first; + fDimShapeZ = retCZ.second; + + // Resolve std::max params to actual input dim params (same logic as BasicBinary) + if (fBroadcastFlag & 4) { + auto IsInputDimParam = [&](const std::string &p) { + for (auto &input : model.GetInputTensorNames()) + for (auto &s : model.GetDimTensorShape(input)) + if (s.isParam && s.param == p) return true; + return false; + }; + for (size_t i = 0; i < fDimShapeZ.size(); i++) { + auto &s = fDimShapeZ[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + // prefer A dim over B dim + if (i < fDimShapeX.size() && IsInputDimParam(fDimShapeX[i].param)) { + s = (fDimShapeX[i].dim != 1) ? fDimShapeX[i] : fDimShapeY[i]; + } else if (i < fDimShapeY.size() && IsInputDimParam(fDimShapeY[i].param)) { + s = (fDimShapeY[i].dim != 1) ? 
fDimShapeY[i] : fDimShapeX[i]; + } + } + } + } + // I need to prepend to shape of X,Y,C the extra dimensions added for broadcasting to Z + if (fDimShapeX.size() < fDimShapeZ.size()) { + size_t nPrepend = fDimShapeZ.size() - fDimShapeX.size(); + fDimShapeX.insert(fDimShapeX.begin(), nPrepend, Dim{1}); + } + if (fDimShapeY.size() < fDimShapeZ.size()) { + size_t nPrepend = fDimShapeZ.size() - fDimShapeY.size(); + fDimShapeY.insert(fDimShapeY.begin(), nPrepend, Dim{1}); + } + if (fDimShapeC.size() < fDimShapeZ.size()) { + size_t nPrepend = fDimShapeZ.size() - fDimShapeC.size(); + fDimShapeC.insert(fDimShapeC.begin(), nPrepend, Dim{1}); + } + + model.AddIntermediateTensor(fNZ, model.GetTensorType(fNX), fDimShapeZ); + + if (model.Verbose()) + std::cout << "Where (dynamic) : C=" << ConvertDimShapeToString(fDimShapeC) + << " A=" << ConvertDimShapeToString(fDimShapeX) + << " B=" << ConvertDimShapeToString(fDimShapeY) + << " --> Y=" << ConvertDimShapeToString(fDimShapeZ) << "\n"; + } + } + + std::string GenerateInitCode() override { + std::stringstream out; + return out.str(); + } + + std::string Generate(std::string opName) override { + + opName = "op_" + opName; + std::stringstream out; + out << SP << "\n//------ WHERE " << opName << " --> " << ConvertDimShapeToString(fDimShapeZ) << "\n"; + if (fIsOutputConstant) return out.str(); + + + // ---------------------------------------------------------------- // + // Runtime broadcast validation (dynamic shapes, flag bit 4) + // ---------------------------------------------------------------- // + if (fBroadcastFlag & 4) { + auto lengthX = ConvertDimShapeToLength(fDimShapeX); + auto lengthY = ConvertDimShapeToLength(fDimShapeY); + auto lengthC = ConvertDimShapeToLength(fDimShapeC); + out << SP << "if (" << lengthX << " != " << lengthY << " || " + << lengthX << " != " << lengthC << ") {\n"; + for (size_t i = 0; i < fDimShapeZ.size(); i++) { + // validate X vs Z + if (i < fDimShapeX.size() && fDimShapeX[i].isParam) { + out << SP 
<< SP << "if (" << fDimShapeX[i] << " != 1 && " + << fDimShapeX[i] << " != " << fDimShapeZ[i] << ")\n"; + out << SP << SP << SP + << "throw std::runtime_error(\"SOFIE Where: cannot broadcast A dim " << i << " in " << opName << "\");\n"; + } + // validate Y vs Z + if (i < fDimShapeY.size() && fDimShapeY[i].isParam) { + out << SP << SP << "if (" << fDimShapeY[i] << " != 1 && " + << fDimShapeY[i] << " != " << fDimShapeZ[i] << ")\n"; + out << SP << SP << SP + << "throw std::runtime_error(\"SOFIE Where: cannot broadcast B dim " << i << " in " << opName << "\");\n"; + } + // validate C vs Z + if (i < fDimShapeC.size() && fDimShapeC[i].isParam) { + out << SP << SP << "if (" << fDimShapeC[i] << " != 1 && " + << fDimShapeC[i] << " != " << fDimShapeZ[i] << ")\n"; + out << SP << SP << SP + << "throw std::runtime_error(\"SOFIE Where: cannot broadcast C dim " << i << " in " << opName << "\");\n"; + } + } + out << SP << "}\n"; + } + // implement now where using teh strides and looping on the different dimensions + // ---------------------------------------------------------------- // + // Generate loop(s) with per-dimension stride-based index arithmetic + // ---------------------------------------------------------------- // + auto stridesX = UTILITY::ComputeStrideFromShape(fDimShapeX); + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + auto stridesC = UTILITY::ComputeStrideFromShape(fDimShapeC); + auto stridesZ = UTILITY::ComputeStrideFromShape(fDimShapeZ); + + auto buildIdxExpr = [&](const std::vector &dimShape, + const std::vector &strides, + size_t rankZ) -> std::string { + if (dimShape.empty() || + std::all_of(dimShape.begin(), dimShape.end(), + [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) + return "0"; + std::string expr; + size_t offset = rankZ - dimShape.size(); + for (size_t i = 0; i < dimShape.size(); ++i) { + if (dimShape[i].dim == 1 || dimShape[i].GetVal() == "1") continue; + expr += "idx_" + std::to_string(i + offset); + if 
(strides[i].GetVal() != "1") + expr += " * " + strides[i].GetVal(); + expr += " + "; + } + if (expr.size() >= 3) + for (int j = 0; j < 3; j++) expr.pop_back(); // remove trailing " + " + return expr.empty() ? "0" : expr; + }; + + std::string idxX = buildIdxExpr(fDimShapeX, stridesX, fDimShapeZ.size()); + std::string idxY = buildIdxExpr(fDimShapeY, stridesY, fDimShapeZ.size()); + std::string idxC = buildIdxExpr(fDimShapeC, stridesC, fDimShapeZ.size()); + + // Emit nested loops over output shape + int nloop = 0; + std::string idxZ; + // case Z is a scalar (all dimensions are 1) or Z has no dimension + if (fDimShapeZ.empty() || + std::all_of(fDimShapeZ.begin(), fDimShapeZ.end(), + [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + idxZ = "0"; + } else { + for (size_t i = 0; i < fDimShapeZ.size(); ++i) { + if (fDimShapeZ[i].dim != 1 && fDimShapeZ[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 0; idx_" << i + << " < " << fDimShapeZ[i] << "; ++idx_" << i << ") {\n"; + idxZ += "idx_" + std::to_string(i); + if (stridesZ[i].GetVal() != "1") + idxZ += " * " + stridesZ[i].GetVal(); + idxZ += " + "; + } + } + if (idxZ.size() >= 3) + for (int j = 0; j < 3; j++) idxZ.pop_back(); + } + + // Inner assignment + for (int j = 0; j < nloop + 1; j++) out << SP; + out << "tensor_" << fNZ << "[" << idxZ << "] = " + << "tensor_" << fNC << "[" << idxC << "] ? 
" + << "tensor_" << fNX << "[" << idxX << "] : " + << "tensor_" << fNY << "[" << idxY << "];\n"; + + // Close loops + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } + + return out.str(); + } + + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeZ.empty()) + throw std::runtime_error("SOFIE Where Op called to Generate without being initialized first"); + + const std::size_t D = fShapeZ.size(); + std::size_t totalElements = ConvertShapeToLength(fShapeZ); + + std::vector shapeA_padded(D, 1); + std::vector shapeB_padded(D, 1); + std::vector shapeC_padded(D, 1); + { + size_t offA = D - fShapeX.size(); + for (size_t i = 0; i < fShapeX.size(); ++i) shapeA_padded[offA + i] = fShapeX[i]; + size_t offB = D - fShapeY.size(); + for (size_t i = 0; i < fShapeY.size(); ++i) shapeB_padded[offB + i] = fShapeY[i]; + size_t offC = D - fShapeC.size(); + for (size_t i = 0; i < fShapeC.size(); ++i) shapeC_padded[offC + i] = fShapeC[i]; + } + + auto stridesA = UTILITY::ComputeStrideFromShape(shapeA_padded); + auto stridesB = UTILITY::ComputeStrideFromShape(shapeB_padded); + auto stridesC = UTILITY::ComputeStrideFromShape(shapeC_padded); + auto stridesZ = UTILITY::ComputeStrideFromShape(fShapeZ); + + std::string typeName = TensorType::Name(); + std::string kname = "WhereKernel_" + opName; + + std::string op; + op = "\n//------ WHERE_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ x,\n"; + op += SP + SP + SP + "T const* __restrict__ y,\n"; + op += SP + SP + SP + "uint8_t const* __restrict__ cond,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const 
global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(stridesZ[d]) + "u) % " + + std::to_string(fShapeZ[d]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const c_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeC_padded[d] == 1) + op += SP + SP + SP + SP + SP + "0u"; + else + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesC[d]) + "u"; + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "std::size_t const x_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeA_padded[d] == 1) + op += SP + SP + SP + SP + SP + "0u"; + else + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesA[d]) + "u"; + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "std::size_t const y_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeB_padded[d] == 1) + op += SP + SP + SP + SP + SP + "0u"; + else + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesB[d]) + "u"; + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = cond[c_idx] ? 
x[x_idx] : y[y_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + std::string kname = "WhereKernel_" + opName; + return SP + kname + " whereKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeZ.empty()) + throw std::runtime_error("SOFIE Where Op called to Generate without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeZ); + std::string kname = "whereKernel_" + opName; + + std::stringstream out; + out << "\n//------ WHERE_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNC << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNZ << ")" + << ", static_cast(" << totalElements << "));\n"; + + return out.str(); + } + +}; + +}//SOFIE + +#endif //TMVA_SOFIE_ROperator_Where diff --git a/src/SOFIE_core/inc/SOFIE/SOFIEHelpers.hxx b/core/inc/SOFIE/SOFIEHelpers.hxx similarity index 100% rename from src/SOFIE_core/inc/SOFIE/SOFIEHelpers.hxx rename to core/inc/SOFIE/SOFIEHelpers.hxx diff --git a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx b/core/inc/SOFIE/SOFIE_common.hxx similarity index 68% rename from src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx rename to core/inc/SOFIE/SOFIE_common.hxx index d183052..e36df0a 100644 --- 
a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx +++ b/core/inc/SOFIE/SOFIE_common.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_SOFIE_COMMON #define SOFIE_SOFIE_COMMON -#include "TMVA/RTensor.hxx" +#include "SOFIE/RTensor.hxx" -#include "ROOT/RSpan.hxx" +#include #include #include @@ -21,13 +21,10 @@ #include #include - -namespace SOFIE{ - -//typedef RTensor tensor_t; +namespace SOFIE { enum class ETensorType{ - UNDEFINED = 0, FLOAT = 1, UNINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive + UNDEFINED = 0, FLOAT = 1, UINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive FLOAT16 = 10, DOUBLE = 11, UINT32 = 12, UINT64 = 13, COMPLEX64 = 14, COMPLEX28 = 15, BFLOAT16 = 16 }; @@ -39,7 +36,7 @@ constexpr size_t GetTypeSize(ETensorType type) { switch (type) { case ETensorType::FLOAT: return sizeof(float); case ETensorType::DOUBLE: return sizeof(double); - case ETensorType::UNINT8: return sizeof(uint8_t); + case ETensorType::UINT8: return sizeof(uint8_t); case ETensorType::INT8: return sizeof(int8_t); case ETensorType::UINT16: return sizeof(uint16_t); case ETensorType::INT16: return sizeof(int16_t); @@ -58,6 +55,9 @@ typedef std::int64_t int_t; std::string ConvertTypeToString(ETensorType type); ETensorType ConvertStringToType(std::string type); +// find if a string represents a number +bool IsInteger(const std::string & s); + struct Dim{ bool isParam = false; size_t dim = 0; @@ -67,16 +67,42 @@ struct Dim{ Dim() {} // constructor for a parametric dimension with the option to pass a default dim value - Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p) {} + // We use -1 for dim to indicate that the param dimension is an expression (e.g. 
"d1+d2") + // in case the string represents a number make Dim not parametric + Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p) + { + if (IsInteger(p)) { + isParam = false; + dim = std::stoi(p); + } + } // constructor for a non-parametric dimension Dim(size_t d) : dim(d) {} std::string GetVal() const { - return (isParam) ? param : std::to_string(dim); + // cast to int64_t for negative shape values + return (isParam) ? param : std::to_string(static_cast(dim)); + } + + std::ostream& operator<< (std::ostream& os) const { + os << GetVal(); + return os; + } + + bool operator==(const Dim& rhs) const { + return (isParam && rhs.isParam) ? param == rhs.param : dim == rhs.dim; + } + bool operator!=(const Dim& rhs) const { + return !(*this == rhs); } }; +//bool operator==(const Dim& lhs, const Dim& rhs); +inline std::ostream & operator<< (std::ostream &os, const Dim &d) { + os << d.GetVal(); + return os; +} struct InputTensorInfo{ ETensorType type; @@ -93,6 +119,18 @@ struct DynamicTensorInfo{ std::vector shape; }; +// template traits for Tensor Shape +template +struct TensorShape {}; +template<> +struct TensorShape { + static bool IsDim() { return true; } +}; +template<> +struct TensorShape { + static bool IsDim() { return false; } +}; + // template traits for Tensor type template struct TensorType {}; @@ -120,6 +158,18 @@ template<> struct TensorType { static const std::string Name() { return "uint64_t"; } }; +template<> +struct TensorType { + static const std::string Name() { return "bool"; } +}; +template<> +struct TensorType { + static const std::string Name() { return "int8_t"; } +}; +template<> +struct TensorType { + static const std::string Name() { return "uint8_t"; } +}; struct TensorMemoryInfo { std::string_view tensor_name; @@ -148,47 +198,85 @@ struct MemoryPoolInfo { std::map available_stack; }; -std::vector ConvertShapeToDim(std::vector shape); +std::vector ConvertShapeToDim(const std::vector & shape); -std::vector 
ConvertShapeToInt(std::vector shape); +std::vector ConvertShapeToInt(const std::vector & shape); -std::size_t ConvertShapeToLength(std::vector shape); +std::size_t ConvertShapeToLength(const std::vector & shape); +std::size_t ConvertShapeToLength(const std::vector & shape); -std::string ConvertShapeToString(std::vector shape); -std::string ConvertDynamicShapeToString(std::vector shape); -// std::string ConvertShapeToString(std::vector shape) { -// return ConvertDynamicShapeToString(shape); -// } +std::string ConvertShapeToString(const std::vector & shape); +std::string ConvertDimShapeToString(const std::vector & shape); + +std::string ConvertDimShapeToLength(const std::vector & shape); -std::string ConvertDynamicShapeToLength(std::vector shape); template std::string ConvertValToString(T value) { std::stringstream ret; - if (std::is_floating_point_v) - ret << std::setprecision(std::numeric_limits::max_digits10); - ret << value; + ret << std::to_string(value); + return ret.str(); +} +// float specialization +template<> +inline std::string ConvertValToString(float value) { + std::stringstream ret; + // special case for infinity and Nan + if (std::isinf(value)) + ret << (value > 0 ? "std::numeric_limits::infinity()" : + "-std::numeric_limits::infinity()"); + else if (std::isnan(value)) + ret << "std::numeric_limits::quiet_NaN()"; + else { + ret << std::setprecision(std::numeric_limits::max_digits10); + ret << value; + } + return ret.str(); +} +// double specialization +template<> +inline std::string ConvertValToString(double value) { + std::stringstream ret; + // special case for infinity and Nan + if (std::isinf(value)) + ret << (value > 0 ? 
"std::numeric_limits::infinity()" : + "-std::numeric_limits::infinity()"); + else if (std::isnan(value)) + ret << "std::numeric_limits::quiet_NaN()"; + else { + ret << std::setprecision(std::numeric_limits::max_digits10); + ret << value; + } + return ret.str(); +} +// int64_t specialization for INT64_MIN +template<> +inline std::string ConvertValToString(int64_t value) { + std::stringstream ret; + if (value == INT64_MIN) + ret << "INT64_MIN"; + else + ret << std::to_string(value); return ret.str(); } // convert list of values in a string taking into account the precision template -std::string ConvertValuesToString(size_t n, const T * data) { +std::string ConvertValuesToString(size_t n, const T * data, size_t maxprint = -1) { std::stringstream ret; ret << "{ "; - for (size_t i = 0; i < n; i++) { - if (std::is_floating_point_v) - ret << std::setprecision(std::numeric_limits::max_digits10); - ret << data[i]; + for (size_t i = 0; i < std::min(n,maxprint); i++) { + ret << ConvertValToString(data[i]); if (i < n-1) ret << ", "; + if (i < n-1 && i == maxprint-1) ret << "..... "; } ret << "}"; return ret.str(); } template -std::string ConvertValuesToString(const std::vector & data) { - return ConvertValuesToString(data.size(), data.data()); +std::string ConvertValuesToString(const std::vector & data, size_t maxprint = 5) { + return ConvertValuesToString(data.size(), data.data(), maxprint); } class InitializedTensor { @@ -204,10 +292,18 @@ public: std::shared_ptr const &sharedptr() const { return fData; } // query if tensor comes from a Constant operator bool IsConstantTensor() const { return fConstant;} - // query if tensor needs to be written in a weight file. Constant tensors are not written in a file + // query if tensor needs to be written in a weight file. 
Constant tensors are not written in a separate file bool IsWeightTensor() const { return !fConstant && !fIsNotWritable;} + // check if a Tensor is Writable (need to be written in the file or in the generated code (e.g. as a constant tensor) + // if an initialized tensors is used in a constant operator at compile time does not need to be written and can be omitted in + // the generated code + bool IsNotWritable() const { return fIsNotWritable; } // set not writable initialized tensors - i.e. tensor that must not be written in a file void SetNotWritable() { fIsNotWritable = true;} + // set writable initialized tensors - i.e. tensor that must be written in a file + void SetWritable() { fIsNotWritable = false;} + // set as constant (needed for non-float initialized tensors) + void SetConstant() { fConstant = true;} template T const *data() const @@ -223,16 +319,8 @@ public: for (std::size_t item : fShape) { fSize *= static_cast(item); } - switch (fType) { - case ETensorType::FLOAT: fSize *= sizeof(float); break; - case ETensorType::DOUBLE: fSize *= sizeof(double); break; - case ETensorType::INT32: fSize *= sizeof(int32_t); break; - case ETensorType::INT64: fSize *= sizeof(int64_t); break; - case ETensorType::BOOL: fSize *= sizeof(bool); break; - default: - throw std::runtime_error("TMVA::SOFIE doesn't yet supports serialising data-type " + - ConvertTypeToString(fType)); - } + // get size in bytes + fSize *= GetTypeSize(fType); fPersistentData = static_cast(fData.get()); } void CastPersistentToShared() @@ -271,7 +359,7 @@ private: template ETensorType GetTemplatedType(T /*obj*/ ){ if (std::is_same::value) return ETensorType::FLOAT; - if (std::is_same::value) return ETensorType::UNINT8; + if (std::is_same::value) return ETensorType::UINT8; if (std::is_same::value) return ETensorType::INT8; if (std::is_same::value) return ETensorType::UINT16; if (std::is_same::value) return ETensorType::INT16; @@ -287,6 +375,12 @@ ETensorType GetTemplatedType(T /*obj*/ ){ } namespace 
UTILITY{ + + + +// clean operator and tensor names +std::string Clean_name(std::string input_tensor_name); + // Check if two shapes are equal bool AreSameShape(const std::vector&, const std::vector&); bool AreSameShape(const std::vector&, const std::vector&); @@ -296,10 +390,14 @@ bool AreSameShape(const std::vector&, const std::vector&); // Multidirectional broadcast a list of tensors to the same shape std::vector MultidirectionalBroadcastShape(std::vector>); -// Unidirectional broadcast two shapes to the same shape -std::vector UnidirectionalBroadcastShape(std::vector, std::vector); +// Multidirectional broadcast two shapes to the same shape + +std::pair> MultidirectionalBroadcastShape(std::vector &, std::vector &); +std::vector UnidirectionalBroadcastShape(std::vector &, std::vector &); + +std::pair> MultidirectionalBroadcastShape(std::vector &, std::vector &); + -std::string Clean_name(std::string input_tensor_name); template T* BroadcastConvBias(const T* data, const size_t channel, const std::vector& targetShape) { @@ -343,16 +441,14 @@ T* BroadcastConvBias(const T* data, const size_t channel, const std::vector, class ContT = std::span > -void BroadcastTensor(ConstContT data, const std::vector& shape, const std::vector& targetShape, ContT broadcastedData) { +template> +void BroadcastTensor(ConstContT data, const std::vector& shape, const std::vector& targetShape, T *broadcastedData) { // Size of the shapes (tensor input here have shapes with same sizes, we have already added the needed ones ) size_t size = shape.size(); // Current length of the broadcasted tensor size_t curLength = data.size(); - size_t targetLength = broadcastedData.size(); - assert(ConvertShapeToLength(targetShape) == targetLength); // special case when broadcasting last dimensions (initial shapes must be the same) - if (shape.front() == targetShape.front() && shape.back() == 1 && size > 1) { + if (size > 1 && shape.front() == targetShape.front() && shape.back() == 1) { size_t bsize = 
targetShape.back(); // compute the size of the data to broadcast for (int k = int(size)-2; k >=0; k--) { @@ -360,16 +456,16 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st bsize *= targetShape[k]; } for (size_t i = 0; i < curLength; i++) { - std::fill(broadcastedData.begin() + i*bsize, broadcastedData.begin() + (i+1)*bsize , data[i]); + std::fill(broadcastedData + i*bsize, broadcastedData + (i+1)*bsize , data[i]); } return; } - std::copy(data.begin(), data.end(), broadcastedData.begin()); + std::copy(data.begin(), data.end(), broadcastedData); // Product of the previous dimensions of targetShape size_t arrayNum = 1; // New broadcasted data: is this needed? - std::vector newData(targetLength); + std::vector newData(ConvertShapeToLength(targetShape)); for (size_t idx = 0; idx < size; idx++) { size_t dim = shape[idx]; @@ -385,8 +481,8 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st for (size_t arrayIdx = 0; arrayIdx < arrayNum; arrayIdx++) { for (size_t targetIdx = 0; targetIdx < targetDim; targetIdx++) { size_t offset = arrayIdx * arrayLength * targetDim + targetIdx * arrayLength; - std::copy(broadcastedData.begin() + arrayIdx * arrayLength, - broadcastedData.begin() + (arrayIdx + 1) * arrayLength, + std::copy(broadcastedData + arrayIdx * arrayLength, + broadcastedData + (arrayIdx + 1) * arrayLength, newData.begin() + offset); } } @@ -400,12 +496,11 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st // Update current length curLength = newLength; // Update broadcasted data - std::copy(newData.begin(), newData.begin() + newLength, broadcastedData.begin()); + std::copy(newData.begin(), newData.begin() + newLength, broadcastedData); } // Update the number of arrays arrayNum *= targetDim; } - //return broadcastedData; } // interface where we allocate a new array for broadcasted data @@ -413,10 +508,8 @@ template T* CreateBroadcastTensor(const T* data, const std::vector& shape, const 
std::vector& targetShape, size_t targetLength) { // newShape is an array of size equal to dimension along which we are broadcasting the tensor T* broadcastedData = new T[targetLength]; - std::span bData(broadcastedData, broadcastedData+targetLength); size_t curLength = ConvertShapeToLength(shape); - std::span inData(data, curLength); - BroadcastTensor, std::span>(inData, shape, targetShape, bData); + BroadcastTensor({data, curLength}, shape, targetShape, broadcastedData); return broadcastedData; } // Unidirectional broadcasting shape to targetShape// In unidirectional broadcast - only tensor B can have the shape changed not @@ -429,14 +522,14 @@ T* UnidirectionalBroadcast(const T* data, const std::vector& shape, cons std::vector newShape(targetSize, 1); size_t offset = targetSize - shape.size(); std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - return CreateBroadcastTensor(data, newShape, targetShape, ConvertShapeToLength(targetShape)); + return CreateBroadcastTensor(data, newShape, targetShape, ConvertShapeToLength(targetShape)); } - return CreateBroadcastTensor(data, shape, targetShape, ConvertShapeToLength(targetShape)); + return CreateBroadcastTensor(data, shape, targetShape, ConvertShapeToLength(targetShape)); } // Unidirectional broadcasting shape to targetShape using a passed vector to avoid allocations template -void UnidirectionalBroadcast(const T* data, const std::vector& shape, const std::vector& targetShape, std::span broadcastedData) { +void UnidirectionalBroadcast(const T* data, const std::vector& shape, const std::vector& targetShape, T *broadcastedData) { size_t curLength = ConvertShapeToLength(shape); std::span inData(const_cast(data), curLength); // Prepend shape with ones @@ -445,12 +538,10 @@ void UnidirectionalBroadcast(const T* data, const std::vector& shape, co std::vector newShape(targetSize, 1); size_t offset = targetSize - shape.size(); std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - 
BroadcastTensor(inData, newShape, targetShape, broadcastedData); + BroadcastTensor(inData, newShape, targetShape, broadcastedData); } - BroadcastTensor>(inData, shape, targetShape, broadcastedData); + BroadcastTensor(inData, shape, targetShape, broadcastedData); } -// specialization for vector of boolean -void UnidirectionalBroadcast(const std::vector & data, const std::vector& shape, const std::vector& targetShape, std::vector & broadcastedData); /// compute stride of a tensor given its shape (assume layout is row-major) std::vector ComputeStrideFromShape(const std::vector & shape); @@ -619,8 +710,6 @@ void col2im(const Dtype* data_col, const int channels, //std::cout << "finishing col2imp" << std::endl; } - - } // end namespace UTILITY namespace BLAS{ @@ -631,37 +720,37 @@ extern "C" void sgemm_(const char * transa, const char * transb, const int * m, struct GNN_Data { - TMVA::Experimental::RTensor node_data; // the node feature data, tensor with shape (num_nodes, num_node_features) - TMVA::Experimental::RTensor edge_data; // the edge feature data, tensor with shape (num_edges, num_edge_features) - TMVA::Experimental::RTensor global_data; // the global features, tensor with shape (1, num_global_features) - TMVA::Experimental::RTensor edge_index; // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges) + RTensor node_data; // the node feature data, tensor with shape (num_nodes, num_node_features) + RTensor edge_data; // the edge feature data, tensor with shape (num_edges, num_edge_features) + RTensor global_data; // the global features, tensor with shape (1, num_global_features) + RTensor edge_index; // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges) // edge_index[0,:] are the receivers and edge_index[1,:] are the senders // need to have default constructor since RTensor has not one - GNN_Data(): node_data(TMVA::Experimental::RTensor({})), edge_data(TMVA::Experimental::RTensor({})), 
global_data(TMVA::Experimental::RTensor({})), edge_index(TMVA::Experimental::RTensor({})) {} + GNN_Data(): node_data(RTensor({})), edge_data(RTensor({})), global_data(RTensor({})), edge_index(RTensor({})) {} }; template -TMVA::Experimental::RTensor Concatenate( TMVA::Experimental::RTensor & t1, TMVA::Experimental::RTensor & t2, int axis = 0) +RTensor Concatenate( RTensor & t1, RTensor & t2, int axis = 0) { // concatenate tensor along axis. Shape must be the same except in the dimension of the concatenated axis if (t1.GetMemoryLayout() != t2.GetMemoryLayout()) - throw std::runtime_error("TMVA RTensor Concatenate - tensors have different memory layout"); + throw std::runtime_error("RTensor Concatenate - tensors have different memory layout"); auto & shape1 = t1.GetShape(); auto & shape2 = t2.GetShape(); if (t1.GetSize()/shape1[axis] != t2.GetSize()/shape2[axis]) { std::cout << "axis " << axis << " sizes " << t1.GetSize() << " " << t2.GetSize() << " "; std::cout << "shape 1 : " << ConvertShapeToString(t1.GetShape()); std::cout << " shape 2 : " << ConvertShapeToString(t2.GetShape()) << std::endl; - throw std::runtime_error("TMVA RTensor Concatenate - tensors have incompatible shapes"); + throw std::runtime_error("RTensor Concatenate - tensors have incompatible shapes"); } std::vector outShape = shape1; outShape[axis] = shape1[axis] + shape2[axis]; - TMVA::Experimental::RTensor tout(outShape, t1.GetMemoryLayout()); - if (t1.GetMemoryLayout() == TMVA::Experimental::MemoryLayout::ColumnMajor) { - throw std::runtime_error("TMVA RTensor Concatenate is not yet supported for column major tensors"); + RTensor tout(outShape, t1.GetMemoryLayout()); + if (t1.GetMemoryLayout() == MemoryLayout::ColumnMajor) { + throw std::runtime_error("RTensor Concatenate is not yet supported for column major tensors"); } auto & stride1 = t1.GetStrides(); @@ -693,10 +782,10 @@ inline GNN_Data Concatenate(GNN_Data & data1, GNN_Data & data2, int axis = 0) { inline GNN_Data Copy(const GNN_Data & 
data) { GNN_Data out; - out.node_data = TMVA::Experimental::RTensor(data.node_data.GetShape()); - out.edge_data = TMVA::Experimental::RTensor(data.edge_data.GetShape()); - out.global_data = TMVA::Experimental::RTensor(data.global_data.GetShape()); - out.edge_index = TMVA::Experimental::RTensor(data.edge_index.GetShape()); + out.node_data = RTensor(data.node_data.GetShape()); + out.edge_data = RTensor(data.edge_data.GetShape()); + out.global_data = RTensor(data.global_data.GetShape()); + out.edge_index = RTensor(data.edge_index.GetShape()); std::copy(data.node_data.GetData(), data.node_data.GetData()+ data.node_data.GetSize(), out.node_data.GetData()); std::copy(data.edge_data.GetData(), data.edge_data.GetData()+ data.edge_data.GetSize(), out.edge_data.GetData()); std::copy(data.global_data.GetData(), data.global_data.GetData()+ data.global_data.GetSize(), out.global_data.GetData()); @@ -704,6 +793,136 @@ inline GNN_Data Copy(const GNN_Data & data) { return out; } -}//SOFIE +inline void Gemm_Call(float *output, bool transa, bool transb, int m, int n, int k, float alpha, const float *A, + const float *B, float beta, const float *C) +{ + char ct = 't'; + char cn = 'n'; + const int *lda = transa ? &k : &m; + const int *ldb = transb ? &n : &k; + const int *ldc = &m; + if (C != nullptr) { + std::copy(C, C + m * n, output); + } + BLAS::sgemm_(transa ? &ct : &cn, transb ? &ct : &cn, &m, &n, &k, &alpha, A, lda, B, ldb, + &beta, output, ldc); +} + +inline void Fill(float *output, float value, int size) +{ + std::fill(output, output + size, value); +} + +template +inline void Copy(T *output, T const *input, int size) +{ + std::copy(input, input + size, output); +} + +inline void Relu(float *output, float const *input, int size) +{ + for (int i = 0; i < size; i++) { + output[i] = (input[i] > 0.0f) ? 
input[i] : 0.0f; + } +} +// function to read float from the file dealing with inf and nan values +inline float ParseFloatToken (const std::string & s) { + if (s == "inf") return std::numeric_limits::infinity(); + if (s == "-inf") return -std::numeric_limits::infinity(); + if (s == "nan") return std::numeric_limits::quiet_NaN(); + return std::stof(s); +} + +template +void ReadTensorFromStream(std::istream &is, T &target, std::string const &expectedName, std::size_t expectedLength) +{ + std::string name; + std::size_t length; + is >> name >> length; + if (name != expectedName) { + std::string err_msg = + "sofie failed to read the correct tensor name; expected name is " + expectedName + " , read " + name; + throw std::runtime_error(err_msg); + } + if (length != expectedLength) { + std::string err_msg = "sofie failed to read the correct tensor size; expected size is " + + std::to_string(expectedLength) + " , read " + std::to_string(length); + throw std::runtime_error(err_msg); + } + std::string token; + for (size_t i = 0; i < length; ++i) { + is >> token; + target[i] = ParseFloatToken(token); + } + if (is.fail()) { + throw std::runtime_error("sofie failed to read the values for tensor " + expectedName); + } +} + +//Utility functions to generate code +void EmitNestedLoops(std::stringstream &out, size_t loopRank, const std::vector shape); +void CloseNestedLoops(std::stringstream &out, size_t loopRank); + + +// code for the memory greeding allocations +struct TensorLifeInfo { + int begin; // start time (op index) lifetime + int end; // end time lifetime + size_t size; // size of tensors in bytes +}; + +struct MemoryResult { + std::size_t total_bytes = 0; // total memory needed + std::vector offsets; // resulted offsets for each tensor +}; + +/// Greedy best-fit planner with coalescing free list. +MemoryResult OrganizeMemory(const std::vector & tensorsInfo ); + +// Simple Dimension classes ans helpers to add constexpr meta info on input +// tensors to the emitted code. 
+struct SingleDim { + enum class Kind { + Static, + Symbolic + }; + + Kind kind; + std::size_t dim; + std::string_view name; + + constexpr SingleDim(std::size_t v) : kind(Kind::Static), dim(v), name() {} + constexpr SingleDim(const char *v) : kind(Kind::Symbolic), dim(0), name(v) {} +}; + +struct TensorDims { + const SingleDim *data; + std::size_t size; + + constexpr std::size_t total_size() const + { + std::size_t result = 1; + for (std::size_t i = 0; i < size; ++i) { + result *= data[i].dim; + } + return result; + } +}; + +template +constexpr TensorDims makeDims(Arr const &arr) +{ + return TensorDims{arr.data(), arr.size()}; +} + +inline std::string ConvertOutputTypeToString(ETensorType t) { + // The std::vector is a special type that is not wrapping continuous memory. + // We don't want to use it as a return type. + if (t == ETensorType::BOOL) t = ETensorType::UINT8; + return ConvertTypeToString(t); +} + + +} // namespace SOFIE -#endif //TMVA_SOFIE_RMODEL +#endif //TMVA_SOFIE_COMMON diff --git a/src/SOFIE_core/src/Prototype.cxx b/core/src/Prototype.cxx similarity index 100% rename from src/SOFIE_core/src/Prototype.cxx rename to core/src/Prototype.cxx diff --git a/src/SOFIE_core/src/RFunction.cxx b/core/src/RFunction.cxx similarity index 100% rename from src/SOFIE_core/src/RFunction.cxx rename to core/src/RFunction.cxx diff --git a/src/SOFIE_core/src/RFunction_MLP.cxx b/core/src/RFunction_MLP.cxx similarity index 91% rename from src/SOFIE_core/src/RFunction_MLP.cxx rename to core/src/RFunction_MLP.cxx index eff76f6..5666f3e 100644 --- a/src/SOFIE_core/src/RFunction_MLP.cxx +++ b/core/src/RFunction_MLP.cxx @@ -10,13 +10,13 @@ namespace SOFIE { -RFunction_MLP::RFunction_MLP(FunctionTarget target, Int_t numLayers, Activation activation_function, bool activate_final, GraphType gType): +RFunction_MLP::RFunction_MLP(FunctionTarget target, int_t numLayers, Activation activation_function, bool activate_final, GraphType gType): RFunction_Update(target, gType), 
fNumLayers(numLayers), fActivationFunction(activation_function), fActivateFinal(activate_final) { // assuming all the linear layers has a kernel and a bias initialized tensors if (fActivateFinal) { if (fActivationFunction == Activation::Invalid) { - throw std::runtime_error("TMVA SOFIE GNN doesn't currently supports the provided activation function for " + + throw std::runtime_error("SOFIE GNN doesn't currently supports the provided activation function for " + fFuncName + " update."); } function_block->AddOutputTensorNameList({fFuncName + "Relu" + std::to_string(fNumLayers)}); @@ -43,12 +43,12 @@ void RFunction_MLP::Initialize() { double beta = (fBiasTensors[i].empty()) ? 0. : 1.; op_gemm.reset(new ROperator_Gemm(1.0,beta,0,0,fGemmInput,UTILITY::Clean_name(fKernelTensors[i]),UTILITY::Clean_name(fBiasTensors[i]),fFuncName+"Gemm"+std::to_string(i))); function_block->AddOperator(std::move(op_gemm)); - fGemmInput = fFuncName+"Gemm"+i; + fGemmInput = fFuncName+"Gemm"+std::to_string(i); if (fActivationFunction == Activation::RELU) { std::unique_ptr op_relu; op_relu.reset(new ROperator_Relu(fFuncName+"Gemm"+std::to_string(i), fFuncName+"Relu"+std::to_string(i))); function_block->AddOperator(std::move(op_relu)); - fGemmInput = fFuncName+"Relu"+i; + fGemmInput = fFuncName+"Relu"+std::to_string(i); } } diff --git a/src/SOFIE_core/src/RFunction_Mean.cxx b/core/src/RFunction_Mean.cxx similarity index 100% rename from src/SOFIE_core/src/RFunction_Mean.cxx rename to core/src/RFunction_Mean.cxx diff --git a/src/SOFIE_core/src/RFunction_Sum.cxx b/core/src/RFunction_Sum.cxx similarity index 100% rename from src/SOFIE_core/src/RFunction_Sum.cxx rename to core/src/RFunction_Sum.cxx diff --git a/core/src/RModel.cxx b/core/src/RModel.cxx new file mode 100644 index 0000000..6e9267b --- /dev/null +++ b/core/src/RModel.cxx @@ -0,0 +1,1987 @@ +#include +#include +#include +#include +#include + +#ifdef SOFIE_SUPPORT_ROOT_BINARY +#include "TFile.h" +#endif + +#include "SOFIE/RModel.hxx" 
+#include "SOFIE/SOFIE_common.hxx" + +namespace SOFIE { + +namespace { +const std::string SP = " "; + +void ReplaceAll(std::string &str, const std::string &from, const std::string &to) +{ + size_t pos = 0; + while ((pos = str.find(from, pos)) != std::string::npos) { + str.replace(pos, from.length(), to); + pos += to.length(); + } +} + +bool IsIdentifierChar(char c) +{ + return std::isalnum(static_cast(c)) || c == '_'; +} + +// Returns true if s is a valid C++ identifier (can be used as a variable name). +// Dim::param can be either a plain name (e.g. "W") or a computed expression +// (e.g. "((W+-3)/2+1)"); only the former can be used as a C++ variable name. +bool IsIdentifier(const std::string &s) +{ + if (s.empty() || std::isdigit(static_cast(s[0]))) + return false; + for (char c : s) + if (!IsIdentifierChar(c)) + return false; + return true; +} + +// Get the data member name corresponding to a tensor with a given name. +std::string TensorMember(std::string const &name) +{ + return "tensor_" + name; +} + +} // namespace + +std::vector RModel::GetTensorShape(const std::string & name) const { + auto f = fReadyInputTensorInfos.find(name); + if (f != fReadyInputTensorInfos.end()) { + return f->second.shape; + } + auto f2 = fInitializedTensors.find(name); + if (f2 != fInitializedTensors.end()) { + return f2->second.shape(); + } + auto f3 = fInputTensorInfos.find(name); + if (f3 != fInputTensorInfos.end()) { + throw std::runtime_error("SOFIE tensor [" + name + "] is an input tensor with unspecified dimension parameter"); + } + auto f4 = fIntermediateTensorInfos.find(name); + if (f4 != fIntermediateTensorInfos.end()) { + return f4->second.shape; + } + // case of shape tensors + auto f5 = fShapeTensors.find(name); + if (f5 != fShapeTensors.end()) { + // shape is vector of size 1 with size of shape values or just a scalar + if (f5->second.second) // check scalar flag + return std::vector{}; + else + return std::vector{f5->second.first.size()}; + } + + if 
(fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) + throw std::runtime_error("SOFIE tensor [" + name + "] is a dynamic tensor. Use GetDynamicTensorShape instead of GetTensorShape"); + + if (fIsSubGraph && fParentGraph) + return fParentGraph->GetTensorShape(name); + + throw std::runtime_error("SOFIE tensor [" + name + "] for which the shape is requested is not found"); +} + +std::vector RModel::GetDimTensorShape(const std::string & name) const { + if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { + return f->second.shape; + } + if (auto f = fInputTensorInfos.find(name); f != fInputTensorInfos.end()) { + return f->second.shape; + } + // in case is not a dynamic tensor convert normal shape to Dim one + // for this we need to return the vector by value + return ConvertShapeToDim(GetTensorShape(name)); +} +std::vector RModel::GetDynamicTensorShape(const std::string & name) const { + if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { + return f->second.shape; + } + if (auto f = fInputTensorInfos.find(name); f != fInputTensorInfos.end()) { + return f->second.shape; + } + // throw error if shape is not dynamic + if (!IsDynamicTensor(name)) + throw std::runtime_error("SOFIE tensor [" + name + "] for which the shape is requested is not dynamic"); + + throw std::runtime_error("SOFIE tensor [" + name + "] for which the shape is requested is not found"); +} + +ETensorType RModel::GetTensorType(std::string name) const { + auto f = fReadyInputTensorInfos.find(name); + if (f != fReadyInputTensorInfos.end()) { + return f->second.type; + } + auto f2 = fInitializedTensors.find(name); + if (f2 != fInitializedTensors.end()) { + return f2->second.type(); + } + auto f3 = fInputTensorInfos.find(name); + if (f3 != fInputTensorInfos.end()) { + return f3->second.type; + } + auto f4 = fIntermediateTensorInfos.find(name); + if (f4 != fIntermediateTensorInfos.end()) { + return f4->second.type; + } + auto f5 = 
fDynamicTensorInfos.find(name); + if (f5 != fDynamicTensorInfos.end()){ + return f5->second.type; + } + // case of shape tensor type is INT64 + if (fShapeTensors.find(name) != fShapeTensors.end()){ + return ETensorType::INT64; + } + + if (fIsSubGraph && fParentGraph) + return fParentGraph->GetTensorType(name); + + throw std::runtime_error("SOFIE tensor [" + name + "] for which the type is requested is not found, model name: " + fName); +} + +bool RModel::CheckIfTensorAlreadyExist(std::string tensor_name) { + if (fReadyInputTensorInfos.find(tensor_name) != fReadyInputTensorInfos.end()) return true; + if (fInputTensorInfos.find(tensor_name) != fInputTensorInfos.end()) return true; + if (fInitializedTensors.find(tensor_name) != fInitializedTensors.end()) return true; + if (fIntermediateTensorInfos.find(tensor_name) != fIntermediateTensorInfos.end()) return true; + if (fDynamicTensorInfos.find(tensor_name) != fDynamicTensorInfos.end()) return true; + if (fShapeTensors.find(tensor_name) != fShapeTensors.end()) return true; + if (fIsSubGraph && fParentGraph) return fParentGraph->CheckIfTensorAlreadyExist(tensor_name); + return false; +} + +void RModel::AddInputTensorInfo(std::string input_name, ETensorType type, std::vector shape) { + input_name = UTILITY::Clean_name(input_name); + if (CheckIfTensorAlreadyExist(input_name)) { + throw std::runtime_error("sofie: input tensor with name " + input_name + " already exists \n"); + } + + InputTensorInfo inputInfo { type, shape }; + fInputTensorInfos[input_name] = inputInfo; +} + +void RModel::AddInputTensorInfo(std::string input_name, ETensorType type, std::vector shape) { + input_name = UTILITY::Clean_name(input_name); + if (CheckIfTensorAlreadyExist(input_name)) { + throw std::runtime_error("sofie: input tensor with name " + input_name + " already exists \n"); + } + TensorInfo inputInfo { type, shape }; + fReadyInputTensorInfos[input_name] = inputInfo; +} + +void RModel::AddInputTensorName(std::string input_name) { + 
fInputTensorNames.emplace_back(UTILITY::Clean_name(input_name)); +} + +void RModel::AddOperator(std::unique_ptr op, int order_execution) +{ + AddBlasRoutines(op->GetBlasRoutines()); + auto libs = op->GetStdLibs(); + auto op_input_tensors = op->GetOpInputTensors(); + for (auto &stdlib : libs) { + AddNeededStdLib(stdlib); + } + if (order_execution >= 0) { + fOperators.insert(fOperators.begin() + order_execution, std::move(op)); + } else { + fOperators.push_back(std::move(op)); + order_execution = fOperators.size() - 1; + } + + // storing the last usage of tensors which are input to the operator + // (excluding tensors which are inputs to the model or the initialized (weights) tensors) + // We call this function during parsing so we don't have yet initialized the operators + for (size_t index = 0; index < op_input_tensors.size(); index++) { + if (!IsInitializedTensor(UTILITY::Clean_name(std::string(op_input_tensors[index]))) && + std::find(fInputTensorNames.begin(), fInputTensorNames.end(), + UTILITY::Clean_name(std::string(op_input_tensors[index]))) == fInputTensorNames.end()) { + + fIntermediateTensorFrequencyLookup[op_input_tensors[index]] = order_execution; + if (Verbose()) + std::cout << "adding order execution for " << op_input_tensors[index] << " order " << order_execution + << std::endl; + } + } +} + +void RModel::AddInitializedTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data) { + tensor_name = UTILITY::Clean_name(tensor_name); + //NB: own data + if (CheckIfTensorAlreadyExist(tensor_name)) { + throw std::runtime_error("sofie: initialized tensor with name " + tensor_name + " already exists \n"); + } + InitializedTensor new_tensor {type, shape, data}; + fInitializedTensors[tensor_name] = new_tensor; +} + +void RModel::AddConstantTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data) { + tensor_name = UTILITY::Clean_name(tensor_name); + //NB: own data + if 
(CheckIfTensorAlreadyExist(tensor_name)) { + throw std::runtime_error("sofie: constant tensor with name " + tensor_name + " already exists \n"); + } + InitializedTensor new_tensor {type, shape, data, true}; // add here flag to specify is a constant tensor + fInitializedTensors[tensor_name] = new_tensor; +} + +void RModel::AddShapeTensor(const std::string & name, const std::vector & shape_values, bool scalar){ + auto tensor_name = UTILITY::Clean_name(name); + if (fShapeTensors.count(tensor_name) != 0) { + throw std::runtime_error("sofie: shape tensor with name " + tensor_name + " already exists \n"); + } + fShapeTensors[tensor_name] = std::make_pair(shape_values, scalar); +} + +void RModel::AddAliasTensor(const std::string & name, const std::string & origin){ + // add an alias tensor to origin + auto tensor_name = UTILITY::Clean_name(name); + auto origin_name = UTILITY::Clean_name(origin); + if (fAliasTensors.count(tensor_name) != 0) { + throw std::runtime_error("sofie: alias tensor with name " + tensor_name + " already exists \n"); + } + fAliasTensors[tensor_name] = origin_name; +} + +bool RModel::IsShapeTensor(const std::string & tensor_name) const { + return fShapeTensors.count(tensor_name) != 0; +} + +bool RModel::IsAliasTensor(const std::string & tensor_name) const { + return fAliasTensors.count(tensor_name) != 0; +} + +const std::vector & RModel::GetShapeTensorValues(const std::string & tensor_name) const { + //if (!IsShapeTensor(tensor_name) ) return std::vector{}; + return fShapeTensors.at(tensor_name).first; +} + +bool RModel::IsInitializedTensor(const std::string& tensorName) const { + std::string name = UTILITY::Clean_name(tensorName); + return fInitializedTensors.find(name) != fInitializedTensors.end(); +} +bool RModel::IsConstantTensor(const std::string& tensorName) const { + // a constant tensor is an initialized tensor but has the constant flag set + std::string name = UTILITY::Clean_name(tensorName); + auto itr = fInitializedTensors.find(name); + if 
(itr == fInitializedTensors.end()) return false; + return itr->second.IsConstantTensor(); +} + +// dynamic tensors include also Dim input tensors +bool RModel::IsDynamicTensor(const std::string& tensorName) const { + std::string name = UTILITY::Clean_name(tensorName); + bool ret = fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end(); + return (ret) ? true : IsDimInputTensor(tensorName); +} +bool RModel::IsDimInputTensor(const std::string& tensorName) const { + std::string name = UTILITY::Clean_name(tensorName); + return fInputTensorInfos.find(name) != fInputTensorInfos.end(); +} +bool RModel::IsReadyInputTensor(const std::string& tensorName) const { + std::string name = UTILITY::Clean_name(tensorName); + return fReadyInputTensorInfos.find(name) != fReadyInputTensorInfos.end(); +} + +// generic addition of a tensor +void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector dim_shape) { + auto int_shape = ConvertShapeToInt(dim_shape); + if (!int_shape.empty()) + AddIntermediateTensor(tensor_name, type, int_shape); + else + AddDynamicTensor(tensor_name, type, dim_shape); +} + +void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector shape) { + tensor_name = UTILITY::Clean_name(tensor_name); + if (CheckIfTensorAlreadyExist(tensor_name)) { + throw std::runtime_error("sofie: intermediate tensor with name " + tensor_name + " already exists \n"); + } + TensorInfo new_tensor {type, shape}; + fIntermediateTensorInfos[tensor_name] = new_tensor; +} + +void RModel::AddDynamicTensor(std::string tensor_name, ETensorType type, std::vector shape){ + tensor_name = UTILITY::Clean_name(tensor_name); + if (CheckIfTensorAlreadyExist(tensor_name)){ + throw std::runtime_error("sofie: intermediate tensor with name " + tensor_name + " already exists \n"); + } + DynamicTensorInfo new_tensor {type, shape}; + fDynamicTensorInfos[tensor_name] = new_tensor; + // store shape parameter if not existing + for (auto &d : shape) { 
+ if (d.isParam) { + if (d.dim != size_t(-1)) { + AddShapeParam(d.param, d.dim); + } + } + } +} + +void RModel::AddShapeParam(const std::string & param, size_t default_value) { + if (fShapeParams.count(param) == 0) { + fShapeParams[param] = std::to_string(default_value); + // add also in the vector list (used to keep the order) + fDimShapeNames.push_back(param); + } +} + +void RModel::AddOutputTensorNameList(std::vector outputtensornames) { + fOutputTensorNames.clear(); + for(auto& it : outputtensornames) { + fOutputTensorNames.emplace_back(UTILITY::Clean_name(it)); + } +} + +void RModel::UpdateOutputTensorList(std::vector curr_output_tensors, std::vector new_output_tensors) { + for(auto& it:curr_output_tensors) { + fOutputTensorNames.erase(std::remove(fOutputTensorNames.begin(), fOutputTensorNames.end(), it), fOutputTensorNames.end()); + } + fOutputTensorNames.insert(fOutputTensorNames.end(), new_output_tensors.begin(), new_output_tensors.end()); +} + +void RModel::UpdateInitializedTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data) { + tensor_name = UTILITY::Clean_name(tensor_name); + if (!CheckIfTensorAlreadyExist(tensor_name)) { + throw std::runtime_error("sofie: tensor " + tensor_name + " not found when trying to update it"); + } + InitializedTensor new_tensor {type, shape, data}; + fInitializedTensors[tensor_name] = new_tensor; +} + +std::shared_ptr RModel::GetInitializedTensorData(std::string tensor_name) { + auto f = fInitializedTensors.find(tensor_name); + if (f == fInitializedTensors.end()) { + throw std::runtime_error("sofie: tensor " + tensor_name + " not found when trying to get its data"); + } else { + return f->second.sharedptr(); + } +} + +void RModel::RemoveInitializedTensor(std::string tensor_name) { + auto f = fInitializedTensors.find(tensor_name); + if (f == fInitializedTensors.end()) { + throw std::runtime_error("sofie: tensor " + tensor_name + " not found when trying to remove it"); + } else { + 
fInitializedTensors.erase(f); + } +} + +void RModel::SetNotWritableInitializedTensor(const std::string & tensor_name) { + auto t = fInitializedTensors.find(tensor_name); + if (t == fInitializedTensors.end()) { + throw std::runtime_error("sofie: initialized tensor " + tensor_name + " not found when trying to get its info"); + } + t->second.SetNotWritable(); + } + +std::string RModel::AllocateIntermediateMemory(std::span op_output_tensors) +{ + std::stringstream code; + + if (fVerbose) { + std::cout << "Total chunks allocated\n"; + for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) { + std::cout << "..... chunk " << chunk->first << " size " << chunk->second.tensor_size << " " << chunk->second.tensor_name << std::endl; + } + } + + auto declareIntermediateTensor = [this, &code](std::string const &name, size_t size, size_t location) { + std::string typeName = ConvertTypeToString(GetTensorType(name)); + code << "\n // Allocating memory for intermediate tensor " << name << " with size " << size << " bytes"; + code << "\n" + << typeName << "* " << TensorMember(name) << " = reinterpret_cast<" << typeName + << "*>(fIntermediateMemoryPool.data() + " << location << ");\n"; + }; + + if (fVerbose) std::cout << "*** AllocateIntermediateMemory: Loop on op output tensors\n"; + // order output tensors by size + std::vector ordered_output_tensors; + + for (auto &it : op_output_tensors) { + auto name = std::string(it); + if (GetTensorType(name) == ETensorType::BOOL || fInitializedTensors.find(name) != fInitializedTensors.end() || + fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) + continue; + + // case of alias tensor + if (IsAliasTensor(name)) { + continue; + } + + auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name)); + // important fill the pair in the ordered output tensors with the string view and not the string + TensorMemoryInfo tmi = {it, 
tensor_size}; + ordered_output_tensors.push_back(tmi); + } + std::sort(ordered_output_tensors.begin(), ordered_output_tensors.end(), + [](const TensorMemoryInfo &a, const TensorMemoryInfo &b) { return a.tensor_size > b.tensor_size; }); + + for (auto &it : ordered_output_tensors) { + bool allocated = false; + std::string name = std::string{it.tensor_name}; + size_t tensor_size = it.tensor_size; + if (fVerbose) + std::cout << "output tensor " << name << " size " << tensor_size << std::endl; + + for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); + chunk != fIntermediateMemoryInfo.available_stack.end();) { + + if (fVerbose) std::cout << ".. available chunk " << chunk->first << " with size = " << chunk->second; + // check if available memory chunks can accommodate the tensor + if (chunk->second >= tensor_size) { + // need to use here string_view (i.e it.tensor_name) + // split returns the new chunk with size of new tensor. The free chunk is before the used one + auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it.tensor_name, tensor_size); + auto new_chunk_location = chunk->first + chunk->second - tensor_size; + fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk; + + declareIntermediateTensor(name, tensor_size, new_chunk_location); + chunk->second -= tensor_size; + + allocated = true; + + if (fVerbose) std::cout << " is re-used and split in a new of size " << new_chunk.tensor_size << " at " << new_chunk_location; + + if (chunk->second == 0) { + if (fVerbose) std::cout << " and deleted since size matches"; + fIntermediateMemoryInfo.available_stack.erase(chunk); + } + if (fVerbose) std::cout << std::endl; + break; + } else if (chunk->first == fIntermediateMemoryInfo.available_stack.rbegin()->first && + fIntermediateMemoryInfo.total_stack.rbegin()->first == chunk->first) { + // case last available chunk is the last in the memory, we can increase that one + fIntermediateMemoryInfo.total_stack[chunk->first] = 
{it.tensor_name, tensor_size}; + declareIntermediateTensor(name, tensor_size, chunk->first); + fIntermediateMemoryInfo.available_stack.erase(chunk); + allocated = true; + if (fVerbose) std::cout << " is extended with a bigger one of size " << tensor_size << std::endl; + break; + } + ++chunk; + if (fVerbose) std::cout << std::endl; + } + + if (!allocated) { + size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty() + ? 0 + : fIntermediateMemoryInfo.total_stack.rbegin()->first + + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size; + + fIntermediateMemoryInfo.total_stack[chunk_idx] = it; + + declareIntermediateTensor(name, tensor_size, chunk_idx); + + if (fVerbose) std::cout << "no chunk available - add in total stack a new chunk with size of tensor and idx : " << chunk_idx + << std::endl; + } + } + return code.str(); +} + +void RModel::CheckAndFlushIntermediateMemory(std::span op_input_tensors, const size_t& op_idx){ + if (fVerbose) std::cout << "*** CheckAndFlushIntermediateMemory: Loop on input tensors for op " << op_idx << "\n"; + //print available chunks + if (fVerbose) std::cout << "available chunks before freeing them : \n"; + for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); + chunk != fIntermediateMemoryInfo.available_stack.end(); chunk++) { + if (fVerbose) std::cout << "-- free chunk " << chunk->first << " size = " << chunk->second << std::endl; + } + for (auto &iv : op_input_tensors) { + // last occurrence of the tensor is reached => flush it from memory + if (fVerbose) std::cout << ".. 
input tensors : " << iv; + + // for alias tensors replace name with its alias + std::string it{iv}; // convert view to string + if (IsAliasTensor(it)) + it = fAliasTensors[it]; + if (fIntermediateTensorFrequencyLookup[it] == op_idx) { + if (fVerbose) std::cout << " flash condition is met - looping on chunks to find matching one \n"; + for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); + chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) { + if (fVerbose) std::cout << "--- chunk " << chunk->first << " , " << chunk->second.tensor_name << " size " << chunk->second.tensor_size; + if (chunk->second.tensor_name == it) { + if (fVerbose) std::cout << " -- Found chunk corresponding to input tensor: " << chunk->first; + // check if nearby chunks in available memory can coalesce + auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound( + chunk->first); // smallest element greater than the flushed chunk idx + auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) + ? 
fIntermediateMemoryInfo.available_stack.end() + : std::prev(first_greater); // largest element smaller than the flushed chunk idx + + // check if the next stack entry is actually adjacent in memory + + if (last_smaller != fIntermediateMemoryInfo.available_stack.end() && + last_smaller->first + last_smaller->second == chunk->first) { + // merge chunk with previous one + last_smaller->second += chunk->second.tensor_size; + fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second); + if (fVerbose) std::cout << " is adjacent in memory with previous one - merge "; + if (first_greater != fIntermediateMemoryInfo.available_stack.end() && + last_smaller->first + last_smaller->second == first_greater->first) { + // merge also with following one + last_smaller->second += first_greater->second; + fIntermediateMemoryInfo.total_stack[last_smaller->first].merge( + fIntermediateMemoryInfo.total_stack[first_greater->first]); + // delete merged one in available stack and in total stack + fIntermediateMemoryInfo.total_stack.erase(first_greater->first); + fIntermediateMemoryInfo.available_stack.erase(first_greater); + if (fVerbose) std::cout << " merge also with following that is free "; + } + fIntermediateMemoryInfo.total_stack.erase(chunk->first); + if (fVerbose) std::cout << std::endl; + break; + } else if (first_greater != fIntermediateMemoryInfo.available_stack.end() && + chunk->first + chunk->second.tensor_size == first_greater->first) { + // merge with first greater + if (fVerbose) std::cout << " is adjacent in memory with following one - merge \n"; + // cannot modify idx of first_greter. 
Insert a new one and delete previous one + size_t new_size = chunk->second.tensor_size + first_greater->second; + size_t first_greater_idx = first_greater->first; + fIntermediateMemoryInfo.available_stack.erase(first_greater); + // cannot use anymore first_greater + fIntermediateMemoryInfo.available_stack.insert({chunk->first, new_size}); + fIntermediateMemoryInfo.total_stack[chunk->first].merge( + fIntermediateMemoryInfo.total_stack[first_greater_idx]); + fIntermediateMemoryInfo.total_stack.erase(first_greater_idx); + } else { + fIntermediateMemoryInfo.available_stack.insert({chunk->first, chunk->second.tensor_size}); + if (fVerbose) std::cout << " insert in the available stack the chunk with size " << chunk->second.tensor_size << std::endl; + } + chunk->second.tensor_name = "free"; + break; + } + } + } else { + if (fVerbose) std::cout << std::endl; + } + } +} + +void RModel::Initialize(int batchSize, bool verbose) { + std::map inputParams; + if (batchSize > 0) { + inputParams["input_size"] = batchSize; + inputParams["batch_size"] = batchSize; + inputParams["bs"] = batchSize; + } + Initialize(inputParams, verbose); + fIntermediateMemoryInfo = MemoryPoolInfo(); +} +void RModel::Initialize(const std::map & inputParams, bool verbose) { + + fVerbose = int(verbose); + + if (fIsInitialized) { + if (verbose) + std::cout << "Model is already initialized - skip initialization " << std::endl; + return; + } + fIntermediateTensorInfos.clear(); + fDynamicTensorInfos.clear(); + + + // loop on inputs and see if shape can be full specified + // if the batch size is provided it can be used to specify the full shape + // Add the full specified tensors in fReadyInputTensors collection + auto originalInputTensorInfos = fInputTensorInfos; // need to copy because we may delete elements + for (auto &input : originalInputTensorInfos) { + if (verbose) std::cout << "looking at the tensor " << input.first << std::endl; + // if a parameter (e.g. 
batch_size) is specified use for converting parametric shape in defined one + if (!inputParams.empty()) { + for (auto &d : input.second.shape) { + if (d.isParam) { + std::string pname = d.param; + if (pname == input.first + "_size") pname = "input_size"; + auto itr = inputParams.find(pname); + if (itr != inputParams.end() ) { + d = Dim{ itr->second }; + if (verbose) + std::cout << "Tensor: " << input.first << " - fix parametric shape " << itr->first << " to " << itr->second << std::endl; + } + } + } + } + // see if shape now is fully defined + auto shape = ConvertShapeToInt(input.second.shape); + if (verbose) + std::cout << "converting input shape for " << input.first << " " << ConvertShapeToString(shape) << " from " + << ConvertDimShapeToString(input.second.shape) << std::endl; + if (!shape.empty()) { + // case shape is defined (not parametric) we add the tensor in the fReadyInputTensorInfos map and + // we remove the tensor from the fInputTensorInfo where th eold parametric shape was stored + fInputTensorInfos.erase(input.first); + // add to the ready input tensor information the new fixed shape + AddInputTensorInfo(input.first, input.second.type, shape); + // check consistency + assert( fReadyInputTensorInfos.size() + fInputTensorInfos.size() == fInputTensorNames.size()); + } + // store the parameters of the input tensors + else { + // store the found parametric shape parameters + for (auto &d : input.second.shape) { + if (d.isParam) { + if (fShapeParams.count(d.param) == 0) { + fDimShapeNames.push_back(d.param); + fShapeParams[d.param] = std::to_string(d.dim); + } + } + } + } + } + + if (verbose) { + PrintRequiredInputTensors(); + PrintDynamicTensors(); + } + + // Go through model and initialize each operator + int i = 0; + + std::vector temp_available_stack; // vector stores individual chunks of available memory that maybe reused + + // Build set of initialized tensors consumed by at least one runtime operator (need for later) + std::unordered_set 
runtimeInitializedInputs; + for(size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx){ + if (verbose) { + auto& r = *fOperators[op_idx].get(); + std::cout << "Initializing operator " << i << " " << typeid(r).name() << std::endl; + } + fOperators[op_idx]->Initialize(*this); + for(auto &it:fOperators[op_idx]->GetOpOutputTensors()){ + std::string name = std::string{it}; + // check if tensor is not an initialized or output tensor and it is not already in the list + if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() && + std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end() && + fInitializedTensors.find(name) == fInitializedTensors.end()) + { + fIntermediateTensorFrequencyLookup[it] = op_idx; + } + } + // loop for non-constant operators and flag the inputs which are initialized tensors to make sure they are writable + if (!fOperators[op_idx]->IsOutputConstant()) { + for (auto &it : fOperators[op_idx]->GetOpInputTensors()) { + std::string name = std::string{it}; + if (fInitializedTensors.find(name) != fInitializedTensors.end()) { + runtimeInitializedInputs.insert(name); + } + } + } + + i++; + } + + // loop on initialized tensors and make the integers as constant to be + // not written in a weight file and check if the tensors flagged as not writable are really not writable, + // i.e. are not used by non constant operators + for (auto &it : fInitializedTensors) { + // check if not-writable tensors are really not writable, i.e. 
are not used by non constant operators + if (it.second.IsNotWritable() && runtimeInitializedInputs.find(it.first) != runtimeInitializedInputs.end()) { + it.second.SetWritable(); + if (verbose) { + std::cout << "Initialized tensor " << it.first << " is flagged as not writable but is used by non constant operators, set it as writable \n"; + } + } + // if the tensor is an integer we can flag it as constant since it will not be written in a weight file and it is considered equivalent as being created from a Constant operator + // only FLOAT tensors are written in a weight file + if (it.second.type() != ETensorType::FLOAT) { + it.second.SetConstant(); + } + } + + // check if there are initialized tensors to write in a weight file + if (fUseWeightFile) { + bool modelHasWeights = false; + for (auto &it : fInitializedTensors) { + if (it.second.IsWeightTensor()) { + modelHasWeights = true; + break; + } + } + if (!modelHasWeights) + fUseWeightFile = false; + } + + // update fIntermediateTensorFrequencyLookup for alias tensors + for (auto & it : fAliasTensors) { + if (fIntermediateTensorFrequencyLookup.find(it.first) == fIntermediateTensorFrequencyLookup.end()) continue; + if (fIntermediateTensorFrequencyLookup.find(it.second) == fIntermediateTensorFrequencyLookup.end() ) + fIntermediateTensorFrequencyLookup[it.second] = fIntermediateTensorFrequencyLookup[it.first]; + else { + // take the largest one + fIntermediateTensorFrequencyLookup[it.second] = std::max(fIntermediateTensorFrequencyLookup[it.second],fIntermediateTensorFrequencyLookup[it.first] ); + } + } + + fIsInitialized = true; +} + +void RModel::InitializeSubGraph(std::shared_ptr graph) { + // add the subgraph to the list + fSubGraphs.push_back(graph); + //this needs to be done before initializing + graph->fParentGraph = this; + graph->fIsSubGraph = true; + + graph->Initialize(fBatchSize, fVerbose); + // set the same options as parent model + graph->fWeightFile = fWeightFile; + graph->fUseWeightFile = fUseWeightFile; 
+ graph->fUseSession = fUseSession; + // add needed blas routines and libs + std::vector blasRoutines; + for (auto & e : graph->fNeededBlasRoutines) + blasRoutines.push_back(e); + AddBlasRoutines(blasRoutines); + for (auto e : graph->fNeededStdLib) + AddNeededStdLib(e); + + // add parent input tensors to current graph + for (auto & name : fInputTensorNames) + graph->fInputTensorNames.emplace_back(name); + + // clean graph name + graph->fName = UTILITY::Clean_name(graph->fName); + +} + +// Function to generate the code for declaring and initializing constant tensors +// This is for tensors which are not part of weight files and can be created from the Constant operator +template +std::string GenerateConstantTensorCode(const std::pair &t) +{ + std::stringstream strs; + std::string type = ConvertTypeToString(t.second.type()); + size_t length = ConvertShapeToLength(t.second.shape()); + // avoid using stack sizes for constant tensors to reduce compilation time + // also for weights which can be broadcasted do not use stack but allocate as a std::vector + bool allocateOnStack = (length > 100 || t.second.IsWeightTensor()) ? 
false : true; + + const T *data = t.second.data(); + + // and check if all values are the same + bool sameData = false; + + // for non stack allocation check if data are the same + if (!allocateOnStack && length > 1) { + size_t idx = 1; + do { + sameData = (data[idx] == data[idx - 1]); + idx++; + } while (sameData && idx < length); + } + if (allocateOnStack) { + strs << type << " fTensor_" << t.first << "[" << length << "] = " << ConvertValuesToString(length, data) << ";\n"; + strs << type << " * " << TensorMember(t.first) << " = fTensor_" + t.first + ";\n"; + } else { + strs << "std::vector<" << type << "> fTensor_" << t.first << " = "; + if (sameData) + strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; + else { + strs << ConvertValuesToString(length, data) << ";\n"; + } + strs << type << " * " << TensorMember(t.first) << " = fTensor_" + t.first + ".data();\n"; + } + return strs.str(); +} + +void RModel::GenerateInitializedTensorInfo() +{ + if (!fInitializedTensors.empty()) + fGC += "// initialized (weights and constant) tensors\n"; + + // here are constant tensor or initialized ones which are not weights (e.g. 
int64_t tensors ) + for (auto &i : fInitializedTensors) { + if (i.second.IsNotWritable()) continue; + size_t length = ConvertShapeToLength(i.second.shape()); + if (!fUseWeightFile || i.second.IsConstantTensor() || !i.second.IsWeightTensor() || i.second.type() != ETensorType::FLOAT ) { + if (i.second.type() == ETensorType::FLOAT) { + // check if NaN of Inf are inside tensor data + bool hasInfOrNaN = false; + const float *data = i.second.data(); + for (size_t idx = 0; idx < length; idx++) { + if (std::is_floating_point::value) { + if (std::isinf(data[idx]) || std::isnan(data[idx])) { + hasInfOrNaN = true; + break; + } + } + } + if (hasInfOrNaN) + AddNeededStdLib("limits"); + fGC += GenerateConstantTensorCode(i); + fConstantTensorSize += length * sizeof(float); + } else if (i.second.type() == ETensorType::INT64) { + fGC += GenerateConstantTensorCode(i); + fConstantTensorSize += length * sizeof(int64_t); + } else if (i.second.type() == ETensorType::INT32) { + fGC += GenerateConstantTensorCode(i); + fConstantTensorSize += length * sizeof(int32_t); + } else if (i.second.type() == ETensorType::BOOL || i.second.type() == ETensorType::UINT8 ) { + fGC += GenerateConstantTensorCode(i); + fConstantTensorSize += length * sizeof(uint8_t); + } + + + } else { + // case of tensors which are read from a file + if (i.second.type() == ETensorType::FLOAT) { + fGC += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; + fGC += "float * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; + fWeightsTensorSize += length * sizeof(float); + } + } + } +} + +void RModel::GenerateIntermediateMemoryPool() { + if (fIntermediateMemoryInfo.total_stack.empty()) return; + fGC += "\n//--- Allocating session memory pool to be used for allocating intermediate tensors\n"; + + // char memory block is allocated since char takes 1 byte, thus easier to allocate tensors + // of other data types + auto const &totalStack = fIntermediateMemoryInfo.total_stack; 
+ const size_t memPoolSize = totalStack.rbegin()->first + totalStack.rbegin()->second.tensor_size; + fGC += "std::vector fIntermediateMemoryPool = std::vector(" + std::to_string(memPoolSize) + ");\n\n"; +} + +void RModel::GenerateIntermediateTensorInfo() { + if (!fIntermediateTensorInfos.empty()) { + std::string tensor_declaration_block = ""; + for (auto &i : fIntermediateTensorInfos) { + bool is_alias = (IsAliasTensor(i.first)); + if (i.second.type == ETensorType::BOOL && !is_alias) { + tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; + tensor_declaration_block += "std::uint8_t * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; + continue; + } + bool is_extended = (fOptimizationLevel == OptimizationLevel::kExtended); + bool not_in_freq_map = + (fIntermediateTensorFrequencyLookup.find(i.first) == fIntermediateTensorFrequencyLookup.end()); + bool not_in_output_names = + (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == fOutputTensorNames.end()); + + if (((not_in_freq_map && not_in_output_names) || (!not_in_freq_map && !is_extended && not_in_output_names) ) && !is_alias) { + size_t length = ConvertShapeToLength(i.second.shape); + + if (i.second.type == ETensorType::FLOAT) { + tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; + tensor_declaration_block += "float * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 4 * length; + } + else if (i.second.type == ETensorType::DOUBLE) { + tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; + tensor_declaration_block += "double * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 8 * length; + } + else if (i.second.type == ETensorType::INT64) { + tensor_declaration_block += 
"std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; + tensor_declaration_block += "int64_t * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 8 * length; + } + } + if (is_alias) { + tensor_declaration_block += ConvertTypeToString(i.second.type) + " * " + TensorMember(i.first) + " = nullptr;\n"; + } + + } + + if (tensor_declaration_block.length()) { + fGC += "\n//--- declare and allocate the intermediate tensors\n" + tensor_declaration_block; + } + } + // add also the dynamic tensors (only declarations, allocation will be done later) + if (!fDynamicTensorInfos.empty()) { + fGC += "//--- declare the dynamic tensors\n"; + for (auto &i : fDynamicTensorInfos) { + fGC += ConvertTypeToString(i.second.type) + " * " + TensorMember(i.first) + " = nullptr;\n"; + } + fGC += "//--- dynamic tensors pool\n"; + fGC += "std::vector fDynamicMemoryPool;\n"; + } +} + +// generate code for specific operator declarations to be defined in the Session class +void RModel::GenerateOperatorDeclarations() { + std::string strcode; + for (auto & op : fOperators) { + strcode += op->GenerateDeclCode(); + } + if (strcode.empty()) return; + fGC += "\n//---- operator declarations \n"; + fGC += strcode; + fGC += "\n"; +} + +void RModel::GenerateDynamicTensorInfo() +{ + // generate code for allocating dynamic tensors using the greedy memory allocations + if (fDynamicTensorInfos.empty()) + return; + + if (fVerbose) { + std::cout << "generating code for dynamic tensor management" << std::endl; + PrintDynamicTensors(); + } + + std::stringstream out; + out << "// dynamic tensor memory management\n"; + out << SP << "std::vector dynamicTensorInfos;\n"; + out << SP << "dynamicTensorInfos.reserve(" << fDynamicTensorInfos.size() << ");\n"; + + // loop on all the operators to find begin/end life of the tensors + int op_index = 0; + std::vector> tensors; + tensors.reserve(fDynamicTensorInfos.size()); + for (auto & op : fOperators) { + 
// loop on output tensors - + for (auto &it : op->GetOpOutputTensors()) { + if (fVerbose) { + auto op_ptr = op.get(); + std::cout << "Looping on operator " << op_index << " " << typeid(*op_ptr).name() << std::endl; + } + // check if is a dynamic tensor and not an alias tensor or output tensor + std::string name = std::string(it); + if ( fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end() && !IsAliasTensor(name) + && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end()) { + auto tensor_size = ConvertDimShapeToLength(GetDimTensorShape(name)); + auto type = GetTensorType(name); + size_t type_size = GetTypeSize(type); + int begin = op_index; + int end = fOperators.size(); + // look for end + auto it_lookup = fIntermediateTensorFrequencyLookup.find(name); + if (it_lookup != fIntermediateTensorFrequencyLookup.end()) + end = it_lookup->second + 1; // end is last time used + 1 + // // some tensors (like xcol in convolutions) are just used within the operators + // if (end == 0 && begin > 0) end = begin+1; + + if (begin> end) { + std::cout << "op " << op_index << "tensor_" << name << " begin " << begin << " " << " end " << end << std::endl; + throw std::runtime_error("sofie: RModel::GenerateDynamicTensorInfo: tensor_" + name + " has end before begin"); + } + + // write in code + out << SP << "dynamicTensorInfos.push_back( {" << begin << ", " << end << ", " << type_size << "* (" << tensor_size << ") });" + << " // tensor_" << name << std::endl; + tensors.push_back({name,type}); + } + } + op_index++; // increment operator index + } + out << "\n" << SP << "auto memory_result = OrganizeMemory(dynamicTensorInfos);\n\n"; + out << "// allocating now the memory\n"; + out << SP << "fDynamicMemoryPool = std::vector(memory_result.total_bytes);\n"; + out << SP << "int idx = 0;\n"; + for (auto & it : tensors) { + out << SP << "tensor_" << it.first << " = reinterpret_cast<" << ConvertTypeToString(it.second) << " 
*>(fDynamicMemoryPool.data() + memory_result.offsets[idx++]);\n"; + } + // check that all dynamic tensors are covered + bool missingTensor = false; + for (auto &i : fDynamicTensorInfos) { + if (IsAliasTensor(i.first)) continue; + if (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) != fOutputTensorNames.end()) continue; + if (std::find(tensors.begin(), tensors.end(), std::pair{i.first, i.second.type}) == tensors.end()) { + std::cout << "Dynamic tensors " << i.first << " is not in list of operator input/output " << std::endl; + missingTensor = true; + } + } + if (missingTensor) + throw std::runtime_error("sofie: RModel::GenerateDynamicTensorInfo - some tensors are not in input/output list"); + + fGC += out.str(); +} + +/// Check if a given parameter is used for the shape of an input tensor. +bool RModel::IsInputTensorShapeParam(std::string const ¶mName) const +{ + for (auto &name : fInputTensorNames) { + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + if (d.param == paramName) + return true; + } + } + } + return false; +} + +/// Collects all identifiers starting with "tensor_" in the input code, +/// provided that the occurrence is not immediately preceded by a +/// character that is valid in a C++ identifier. Excludes input and output tensor names. +/// Returns a deduplicated std::vector. 
+std::vector RModel::CollectTensorMemberNames(const std::string &input) +{ + const std::string target = "tensor_"; + + std::vector result; + + for (size_t i = 0; i < input.size();) { + + bool doCollect = false; + + if (i + target.size() <= input.size() && input.compare(i, target.size(), target) == 0 && + (i == 0 || !IsIdentifierChar(input[i - 1]))) { + + doCollect = true; + + std::size_t j = i + target.size(); + + // Extend to full identifier + while (j < input.size() && IsIdentifierChar(input[j])) + ++j; + + std::string fullName = input.substr(i, j - i); + + // Exclude input tensor names + for (std::string const &name : fInputTensorNames) { + if (fullName == target + name) { + doCollect = false; + break; + } + } + + // Exclude output tensor names + if (doCollect) { + for (std::string const &name : fOutputTensorNames) { + if (fullName == target + name) { + doCollect = false; + break; + } + } + } + + if (doCollect) { + result.push_back(fullName); + } + + i = j; // advance past the identifier + } else { + ++i; + } + } + + // Deduplicate (order not preserved) + std::sort(result.begin(), result.end()); + result.erase(std::unique(result.begin(), result.end()), result.end()); + + return result; +} + +std::string RModel::GenerateInferSignature(bool isdecl) { + // generate the infer signature given the inputs: eg. "float * tensor1, float * tensor2" + // if (decl = false) generate only calling signature (tensor1,tensor2,....) 
+ std::string rGC; + std::unordered_map inputParams; + int i_input = 0; + for (auto &name : fInputTensorNames) { + // if is a dynamic tensor pass initial parameters + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + std::string pName = d.param; + // need to check if the input parameters is already existing in another input tensor + if (d.isParam && inputParams.count(pName) == 0) { + if (isdecl) rGC += "size_t "; + rGC += d.param + ","; + inputParams[pName] = i_input; + } + } + } + if (isdecl) { + std::string type = ConvertTypeToString(GetTensorType(name)); + if (type == "other") + throw std::runtime_error("sofie: input tensor " + name + + " is of a data type which is not yet supported."); + rGC += type + " const* "; + } + rGC += "tensor_" + name + ","; + i_input++; + } + + if (fInputTensorNames.size() > 0) rGC.pop_back();// remove last "," + return rGC; +} + +namespace { + +std::string typeForOutput(ETensorType t) { + // The std::vector is a special type that is not wrapping continuous memory. + // We don't want to use it as a return type. 
+ if (t == ETensorType::BOOL) t = ETensorType::UINT8; + return ConvertTypeToString(t); +} + +std::string memberNameForDimShape(std::string name) +{ + if (!name.empty()) { + name[0] = std::toupper(static_cast(name[0])); + } + name = "f" + name; + return name; +} + +} + +void RModel::GenerateOutput() +{ + size_t outputSize = fOutputTensorNames.size(); + // assume output types are all the same + + bool sameOutputTypes = true; + std::string inferReturnType; // type return by infer function + ETensorType eFirstOutputType = GetTensorType(*fOutputTensorNames.begin()); + fGC += "\n\n"; + if (outputSize == 1) { + fGC += "std::vector<" + typeForOutput(eFirstOutputType) + ">"; + } else { + // if all output types are the same we return an std::vector - otherwise a tuple + for (std::string const &name : fOutputTensorNames) { + if (GetTensorType(name) != eFirstOutputType) + sameOutputTypes = false; + } + if (sameOutputTypes) + fGC += "std::vector>"; + else { + inferReturnType = "std::tuple<"; + for (size_t i = 0; i < outputSize; i++) { + inferReturnType += "std::vector<" + typeForOutput(GetTensorType(fOutputTensorNames[i])) + ">"; + if (i < outputSize - 1) + inferReturnType += ","; + } + inferReturnType += ">"; + fGC += inferReturnType; + } + } + + fGC += " infer(" + GenerateInferSignature() + "){\n"; + + std::string doInferArgs = GenerateInferSignature(false); + if (!doInferArgs.empty()) + doInferArgs += ","; + for (std::string const &name : fOutputTensorNames) { + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + std::string n; + if(!isDynamic) { + n = std::to_string(ConvertShapeToLength(GetTensorShape(name))); + } else { + std::string dimLen = ConvertDimShapeToLength(GetDynamicTensorShape(name)); + // Use the session member (fXxx) when any dim is a runtime-computed identifier + // (e.g. NonZero count). For expression-type dims derived from input shapes + // (e.g. "((W+-3)/2+1)"), use the expression directly. 
+ // for input shape parameters we don't need to use the session member since it is passed as argument to the infer function and it is not a runtime computed value + bool hasRuntimeParam = false; + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && IsIdentifier(dim.param) && !IsInputTensorShapeParam(dim.param)) + hasRuntimeParam = true; + } + n = hasRuntimeParam ? memberNameForDimShape(dimLen) : dimLen; + } + std::string outputName = "output_tensor_" + name; + fGC += SP + "std::vector<" + typeForOutput(GetTensorType(name)) + " > " + outputName + "(" + n + ");\n"; + doInferArgs += " " + outputName + ".data(),"; + if(isDynamic) { + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && !IsInputTensorShapeParam(dim.param) && IsIdentifier(dim.param)) { + fGC += SP + "size_t " + dim.param + " = 0;\n"; + doInferArgs += " " + dim.param + ","; + } + } + } + } + if (!doInferArgs.empty()) + doInferArgs.back() = ' '; + + // verifying if the dynamic parameters are within allowed range + std::unordered_set input_params_checked; + std::string dynamic_parameters_check = ""; + for (auto &name : fInputTensorNames) { + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + std::string pName = d.param; + if (d.isParam && input_params_checked.count(pName) == 0) { + std::string memberName = memberNameForDimShape(d.param); + dynamic_parameters_check += d.param + " > " + memberName + " || "; + input_params_checked.insert(pName); + fGC += SP + "if (" + d.param + " > " + memberName + ") {\n"; + fGC += SP + SP + "throw std::runtime_error(\"sofie: dynamic input tensor shape parameter " + + d.param + " exceeds the initialized maximum allowed shape.\");\n"; + fGC += SP + "}\n"; + } + } + } + } + + if (fUseSession) { + fGC += SP + "doInfer(*this, " + doInferArgs + ");\n"; + } else { + fGC += SP + "doInfer(" + doInferArgs + ");\n"; + } + + // If the output tensors have dynamic sizes, now is the time to set 
them + for (std::string const &name : fOutputTensorNames) { + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + if (isDynamic) { + std::string outputName = "output_tensor_" + name; + auto tensor_size = ConvertDimShapeToLength(GetDimTensorShape(name)); + fGC += SP + outputName + ".resize(" + tensor_size + ");\n"; + } + } + + fGC += SP + "return {"; + for (size_t i = 0; i < fOutputTensorNames.size(); i++) { + fGC += "output_tensor_" + fOutputTensorNames[i]; + if (i < fOutputTensorNames.size() - 1) + fGC += ","; + } + fGC += "};\n"; + fGC += "}\n"; // end of infer function scope +} + +void RModel::GenerateSessionCode() +{ + std::string sessionName = !fIsSubGraph ? "Session" : "Session_" + fName; + + if (fUseSession && !fIsGNNComponent) { + // forward declare session struct + fGC += "struct " + sessionName + ";\n"; + } + + // Determine the signature of the actual inference function + std::string doInferSignature = GenerateInferSignature(); + if (!doInferSignature.empty()) + doInferSignature += ", "; + for (auto const &name : fOutputTensorNames) { + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + doInferSignature += typeForOutput(GetTensorType(name)) + " *tensor_" + name + ","; + if(isDynamic) { + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && !IsInputTensorShapeParam(dim.param) && IsIdentifier(dim.param)) + doInferSignature += " size_t &" + dim.param + "_output,"; + } + } + } + doInferSignature.back() = ' '; + + if (fUseSession) { + doInferSignature = sessionName + " const &session, " + doInferSignature; + } + + doInferSignature = "inline void doInfer(" + doInferSignature + ")"; + + if (!fIsGNNComponent) { + // forward declare inference implementation + fGC += doInferSignature + ";\n"; + } + + // define the Session struct (for GNN this is generated in RModel_GNN) + if (fUseSession && !fIsGNNComponent) { + fGC += "struct " + sessionName + " {\n"; + } + + // generate code for declaring the initialized tensors + 
GenerateInitializedTensorInfo(); + + if (fOptimizationLevel == OptimizationLevel::kExtended) { + // evaluate total intermediate memory and position intermediate tensor addresses + std::string intermediate_memory_alloc_string = ""; + intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --"; + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) { + auto op = fOperators[op_idx].get(); + std::cout << "\n******************\n analyzing input/output operator " << op_idx << " " + << typeid(*op).name() << std::endl; + } + intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors()); + CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); + } + + // to check remaining unused fragments after memory allocation (lesser the better) + // for (const auto &it: fIntermediateMemoryInfo.available_stack){ + // std::cout<<"chunk_idx: "<fName + " fSession_" + graph->fName + ";\n"; + } + + // Generate code for Session constructor + if (fUseSession) { + // add here specific operator code that needs to define session data members + fGC += "\n"; + for (size_t id = 0; id < fOperators.size(); id++) { + std::string opName = std::to_string(id); + fGC += fOperators[id]->GenerateSessionMembersCode(opName); + } + fGC += "\n"; + // here add initialization and reading of weight tensors + if (fUseWeightFile) { + std::string fileName = fName; + if (fWeightFile == WeightFileType::Text) { + fileName += ".dat"; + } + if (fWeightFile == WeightFileType::RootBinary) { + fileName += ".root"; + } + fGC += sessionName + "(std::string filename =\"" + fileName + "\""; + } else { + // no need to pass weight file since it is not used + // keep passing a string for compatibility + fGC += sessionName + "(std::string = \"\""; + } + // add initialization of shape parameters + // assume all parameters are of type size_t + if (!fDimShapeNames.empty()) { + // need to use same order as in 
infer function not alphabetical one + for (auto &p : fDimShapeNames) { + fGC += ",\n"; + fGC += " size_t " + p + " = " + fShapeParams[p]; + } + } + fGC += ") {\n"; + + // initializing dynamic parameters + if (!fDimShapeNames.empty()) { + fGC += "\n\n"; + std::sort(fDimShapeNames.begin(), fDimShapeNames.end()); + for (const auto &p : fDimShapeNames) { + fGC += " " + memberNameForDimShape(p) + " = " + p + ";\n"; + } + } + // add some extra code needed for initialization of dynamic parameters + fGC += fExtraCodeForDimShapes; + + if (fUseWeightFile) { + fGC += "\n//--- reading weights from file\n"; + ReadInitializedTensorsFromFile(fReadPos); + fGC += "\n"; + // fUseWeightFile = fUseWeightFile; + } + + // now we have passed the parameters we can allocate the dynamic tensors + GenerateDynamicTensorInfo(); + + // add here initialization code for operator + for (size_t id = 0; id < fOperators.size(); id++) { + fGC += fOperators[id]->GenerateInitCode(); + } + + fGC += "}\n\n"; + } + + // generate the inference overload that returns an output struct + GenerateOutput(); + + // end of session + if (fUseSession && !fIsGNNComponent) { + fGC += "}; // end of Session\n\n"; + + GenerateRequiredInputTensorInfo(); + } + + fGC += doInferSignature + " {\n"; + fGC += "\n"; + + // generate the inference code + if (fVerbose) + std::cout << "Generating main inference code for " << fName << std::endl; + + if (fOutputTensorNames.size() == 0) + throw std::runtime_error("sofie: output size=0 are not supported"); + + std::string allOperatorCode; + + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) + std::cout << "Generating code for operator .... 
" << op_idx << std::endl; + std::string operatorCode = fOperators[op_idx]->Generate(std::to_string(op_idx)); + allOperatorCode += operatorCode; + } + + // If the generated code users members of the session struct, use the + // local variable name that we're using for the session: + ReplaceAll(allOperatorCode, "this->", "session."); + + if (fUseSession && !fIsGNNComponent) { + // Collect all "tensor_*" data members that are not input or output tensors + std::vector tensorMemberNames = CollectTensorMemberNames(allOperatorCode); + for (auto const& name: tensorMemberNames) { + fGC += " auto &" + name + " = session." + name + ";\n"; + } + fGC += "\n"; + } + + fGC += allOperatorCode; + + for (auto const& name: fOutputTensorNames) { + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + if(isDynamic) { + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && !IsInputTensorShapeParam(dim.param) && IsIdentifier(dim.param)) + fGC += " " + dim.param + "_output = " + dim.param + ";\n"; + } + } + if(IsConstantTensor(name)) { + std::string t = "session.tensor_" + name; + size_t length = ConvertShapeToLength(fInitializedTensors[name].shape()); + fGC += " std::copy(" + t + ", " + t + " + " + std::to_string(length) + ", tensor_" + name + ");\n"; + } + } + fGC += "\n"; + + fGC += "}\n"; +} + +void RModel::Generate(std::underlying_type_t options, int batchSize, long pos, bool verbose) +{ + fVerbose = verbose; + fBatchSize = batchSize; + fReadPos = pos; + + // session flag is used in operator initialize + if (static_cast>(Options::kNoSession) & options) { + fUseSession = false; + fWeightFile = WeightFileType::None; + } + if (static_cast>(Options::kNoWeightFile) & options) { + fUseWeightFile = false; + fWeightFile = WeightFileType::None; + } + if (static_cast>(Options::kRootBinaryWeightFile) & options) { + fUseWeightFile = true; + fWeightFile = WeightFileType::RootBinary; + } + if (fUseWeightFile && !fUseSession) { + throw std::runtime_error( + "sofie: 
RModel::Generate: cannot use a separate weight file without generating a Session class"); + } + + if (static_cast>(Options::kGNN) & options) + fIsGNN = true; + if (static_cast>(Options::kGNNComponent) & options) + fIsGNNComponent = true; + + // initialize the model including all operators and sub-graphs + Initialize(batchSize, verbose); + + // if having dynamic tensor we need to have a Session + if (!fDynamicTensorInfos.empty()) { + fUseSession = true; + if (verbose) + std::cout << "Warning: Force having a Session since model has dynamic tensors " << std::endl; + } + + std::string hgname; + if (!fIsGNNComponent && !fIsSubGraph) { + fGC.clear(); + GenerateHeaderInfo(hgname); + } + + // generate first code for the subgraphs + for (auto &graph : fSubGraphs) { + if (fVerbose) + std::cout << "generate session code for subgraph " << graph->fName << std::endl; + graph->GenerateSessionCode(); + fGC += graph->fGC; + } + + if (fVerbose) + std::cout << "generate Main session code - model " << fName << std::endl; + + // generate main session code + GenerateSessionCode(); + + if (!fIsGNNComponent && !fIsSubGraph) { + fGC += ("} //TMVA_SOFIE_" + fName + "\n"); + fGC += "\n#endif // " + hgname + "\n"; + } +} + +void RModel::ReadInitializedTensorsFromFile(long pos) { + // generate the code to read initialized tensors from a text data file + if (fWeightFile == WeightFileType::Text) { + // check if there are tensors to write + + if (!fUseWeightFile) return; + + fGC += " std::ifstream f;\n"; + fGC += " f.open(filename);\n"; + fGC += " if (!f.is_open()) {\n"; + fGC += " throw std::runtime_error(\"sofie failed to open file \" + filename + \" for input weights\");\n"; + fGC += " }\n"; + + if(fIsGNNComponent) { + fGC += " f.seekg(" + std::to_string(pos) + ");\n"; + } + + fGC += " using SOFIE::ReadTensorFromStream;\n"; + + // loop on tensors and parse the file + for (auto& i: fInitializedTensors) { + // skip Constant and shape tensors (not written in a file) + if 
(!i.second.IsWeightTensor()) continue; + std::string tensor_name = "tensor_" + i.first; + if (i.second.type() == ETensorType::FLOAT) { + std::string length = std::to_string(ConvertShapeToLength(i.second.shape())); + fGC += " ReadTensorFromStream(f, " + tensor_name + ", \"" + tensor_name + "\", " + length + ");\n"; + } else { + throw std::runtime_error("sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); + } + } + fGC += " f.close();\n"; + } + + // generate the code to read initialized tensors from a ROOT data file + if(fWeightFile == WeightFileType::RootBinary) { +#ifdef SOFIE_SUPPORT_ROOT_BINARY + fGC += " {\n"; + fGC += " std::unique_ptr rootFile(TFile::Open(filename.c_str(), \"READ\"));\n"; + fGC += " if (!rootFile->IsOpen()) {\n"; + fGC += " throw std::runtime_error(\"sofie failed to open ROOT file for input weights\");\n"; + fGC += " }\n"; + + std::string dirName = fName + "_weights"; + fGC += " if (!rootFile->GetKey(\"" + dirName + "\")) {\n"; + fGC += " throw std::runtime_error(\"sofie failed to open ROOT directory for input weights\");\n"; + fGC += " }\n"; + + for (auto &i : fInitializedTensors) { + // skip Constant and shape tensors + if (!i.second.IsWeightTensor()) continue; + fGC += " {\n"; + std::string tensor_name = "tensor_" + i.first; + if (i.second.type() == ETensorType::FLOAT) { + fGC += " fTensor_" + i.first + " = *reinterpret_cast*>(rootFile->Get(\""; + fGC += dirName + "/" + tensor_name + "\"));\n"; + } else if (i.second.type() == ETensorType::DOUBLE) { + fGC += " fTensor_" + i.first + " = *reinterpret_cast*>(rootFile->Get(\""; + fGC += dirName + + "/" + tensor_name + "\"));\n"; + } else if (i.second.type() == ETensorType::INT64) { + fGC += " fTensor_" + i.first + " = *reinterpret_cast*>(rootFile->Get(\""; + fGC += dirName + "/" + tensor_name + "\"));\n"; + } else { + throw std::runtime_error("sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " 
cannot be read from a ROOT file"); + } + fGC += " }\n"; + } + fGC += " }\n"; +#else + throw std::runtime_error("SOFIE was not built with ROOT file support."); +#endif // SOFIE_SUPPORT_ROOT_BINARY + } +} + +long RModel::WriteInitializedTensorsToFile(std::string filename) { + // Determine the file extension based on the weight file type + std::string fileExtension; + switch (fWeightFile) { + case WeightFileType::None: + fileExtension = ".dat"; + break; + case WeightFileType::RootBinary: + fileExtension = ".root"; + break; + case WeightFileType::Text: + fileExtension = ".dat"; + break; + } + + // If filename is empty, use the model name as the base filename + if (filename.empty()) { + filename = fFileName + fileExtension; + } + + // Write the initialized tensors to the file + if (fWeightFile == WeightFileType::RootBinary) { +#ifdef SOFIE_SUPPORT_ROOT_BINARY + if(fIsGNNComponent || fIsGNN) { + throw std::runtime_error("SOFIE-GNN yet not supports writing to a ROOT file."); + } + std::unique_ptr outputFile(TFile::Open(filename.c_str(), "UPDATE")); + + std::string dirName = fName + "_weights"; + // check if directory exists, in case delete to replace with new one + if (outputFile->GetKey(dirName.c_str())) + outputFile->rmdir(dirName.c_str()); + + auto outputDir = outputFile->mkdir(dirName.c_str()); + + for (const auto& item : fInitializedTensors) { + // skip Constant tensors and tensors which are not writable (e.g. 
shape tensors) + if (!item.second.IsWeightTensor()) continue; + std::string tensorName = "tensor_" + item.first; + size_t length = 1; + length = ConvertShapeToLength(item.second.shape()); + if(item.second.type() == ETensorType::FLOAT) { + const float* data = item.second.data(); + std::vector tensorDataVector(data, data + length); + outputDir->WriteObjectAny(&tensorDataVector, "std::vector", tensorName.c_str()); + } + else if(item.second.type() == ETensorType::DOUBLE) { + const double* data = item.second.data(); + std::vector tensorDataVector(data, data + length); + outputDir->WriteObjectAny(&tensorDataVector, "std::vector", tensorName.c_str()); + } + else if(item.second.type() == ETensorType::INT64) { + const int64_t* data = item.second.data(); + std::vector tensorDataVector(data, data + length); + outputDir->WriteObjectAny(&tensorDataVector, "std::vector", tensorName.c_str()); + } + else { + throw std::runtime_error("sofie tensor " + tensorName + " with type " + ConvertTypeToString(item.second.type()) + + " cannot be written to a ROOT file"); + } + } + outputFile->Write(filename.c_str()); + + // this needs to be changed, similar to the text file + return -1; + +#else + throw std::runtime_error("SOFIE was not built with ROOT file support."); +#endif // SOFIE_SUPPORT_ROOT_BINARY + } else if (fWeightFile == WeightFileType::Text) { + std::ofstream f; + if(fIsGNNComponent) { + // appending all GNN components into the same file + f.open(filename, std::ios::app); + } else { + f.open(filename); + } + if (!f.is_open()) + throw + std::runtime_error("sofie failed to open file " + filename + " for tensor weight data"); + for (auto& i: fInitializedTensors) { + // skip Constant tensors and not writable tensors (e.g. 
shape tensors) + if (!i.second.IsWeightTensor()) { + continue; + } + size_t length = ConvertShapeToLength(i.second.shape()); + std::string tensor_name = "tensor_" + i.first; + f << tensor_name << " " << length << "\n"; + if (i.second.type() == ETensorType::FLOAT) { + const float * data = i.second.data(); + for (size_t idx = 0; idx < length; idx++) { + // round to zero sub-normal values + float value = data[idx]; + if (value != 0. && std::abs(value) < std::numeric_limits::min() ) value = 0; + // handle non-finite values explicitly + if (std::isinf(value)) + f << (value > 0 ? "inf" : "-inf"); + else if (std::isnan(value)) + f << "nan"; + else + f << std::setprecision(std::numeric_limits::max_digits10) << value; + f << ( (idx < length-1) ? " " : "\n" ); + } + } + else { + throw std::runtime_error("sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be written to a file"); + } + if (f.fail()) + throw std::runtime_error("sofie failed to write tensor data to file for " + tensor_name); + } + long curr_pos = f.tellp(); + f.close(); + return curr_pos; + } else { + return -1; + } +} + +void RModel::PrintSummary() const { + std::cout << "Summary of model " << GetName() << std::endl; + for(size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx){ + auto& r = *fOperators[op_idx].get(); + std::string raw_name = typeid(r).name(); + // look for ROperator_NAME + std::string name = raw_name.substr(raw_name.find("ROperator_")+10, raw_name.size()); + std::cout << op_idx << " " << name << " : "; + for (auto & t_in : r.GetOpInputTensors()) std::cout << t_in << " "; + std::cout << " ----> "; + for (auto & t_out : r.GetOpOutputTensors()) std::cout << t_out << " "; + std::cout << std::endl; + } +} + +/// To emit the dimensions of the input tensors as a data member of a session, +/// which is helpful when validating the inference inputs. 
+void RModel::GenerateRequiredInputTensorInfo() +{ + fGC += "\n// Input tensor dimensions\n"; + fGC += "using SOFIE::SingleDim;\n"; + fGC += "using SOFIE::TensorDims;\n"; + fGC += "using SOFIE::makeDims;\n\n"; + bool hasDynamicInputTensors = false; + + for (std::size_t iInput = 0; iInput < fInputTensorNames.size(); ++iInput) { + auto const &name = fInputTensorNames[iInput]; + if (IsDimInputTensor(name)) { + hasDynamicInputTensors = true; + } + std::vector shape = GetDimTensorShape(name); + fGC += "constexpr std::array dim_" + name + "{"; + for (std::size_t iDim = 0; iDim < shape.size(); ++iDim) { + auto const &dim = shape[iDim]; + if (dim.isParam) { + fGC += "SingleDim{\"" + dim.GetVal() + "\"}"; + } else { + fGC += "SingleDim{" + dim.GetVal() + "}"; + } + if (iDim != shape.size() - 1) { + fGC += ", "; + } + } + fGC += "};\n"; + } + fGC += "\nconstexpr std::array inputTensorDims{\n"; + for (std::size_t iInput = 0; iInput < fInputTensorNames.size(); ++iInput) { + auto const &name = fInputTensorNames[iInput]; + fGC += SP + "makeDims(dim_" + name + ")"; + if (iInput == fInputTensorNames.size() - 1) { + fGC += "\n"; + } else { + fGC += ",\n"; + } + } + fGC += "};\n"; + + fGC += + "\nconstexpr bool hasDynamicInputTensors{" + std::string{hasDynamicInputTensors ? 
"true" : "false"} + "};\n\n"; + + fGC += "\n// Output tensor dimensions\n"; + bool hasDynamicOutputTensors = false; + for (std::size_t iOutput = 0; iOutput < fOutputTensorNames.size(); ++iOutput) { + auto const &name = fOutputTensorNames[iOutput]; + if (IsDynamicTensor(name)) { + hasDynamicOutputTensors = true; + } + std::vector shape = GetDimTensorShape(name); + fGC += "constexpr std::array dim_" + name + "{"; + for (std::size_t iDim = 0; iDim < shape.size(); ++iDim) { + auto const &dim = shape[iDim]; + if (dim.isParam) { + fGC += "SingleDim{\"" + dim.GetVal() + "\"}"; + } else { + fGC += "SingleDim{" + dim.GetVal() + "}"; + } + if (iDim != shape.size() - 1) { + fGC += ", "; + } + } + fGC += "};\n"; + } + fGC += "\nconstexpr std::array outputTensorDims{\n"; + for (std::size_t iOutput = 0; iOutput < fOutputTensorNames.size(); ++iOutput) { + auto const &name = fOutputTensorNames[iOutput]; + fGC += SP + "makeDims(dim_" + name + ")"; + if (iOutput == fOutputTensorNames.size() - 1) { + fGC += "\n"; + } else { + fGC += ",\n"; + } + } + fGC += "};\n"; + fGC += + "\nconstexpr bool hasDynamicOutputTensors{" + std::string{hasDynamicOutputTensors ? 
"true" : "false"} + "};\n\n"; +} + +void RModel::PrintRequiredInputTensors() const { + std::cout << "Model requires following inputs:\n"; + for (auto& inputInfo: fInputTensorInfos) { + std::cout << "Parametrised Tensor name: " << inputInfo.first << "\t"; + std::cout << "type: " << ConvertTypeToString(inputInfo.second.type) << "\t"; + std::cout << "shape: ["; + for (size_t i = 0; i < inputInfo.second.shape.size(); i++) { + if (inputInfo.second.shape[i].isParam) { + std::cout << inputInfo.second.shape[i].param; + } else { + std::cout << inputInfo.second.shape[i].dim ; + } + if (i < inputInfo.second.shape.size() - 1) std::cout << ","; + } + std::cout << "]" << std::endl; + } + + for (auto& inputInfo: fReadyInputTensorInfos) { + std::cout << "Fully Specified Tensor name: " << inputInfo.first << "\t"; + std::cout << "type: " << ConvertTypeToString(inputInfo.second.type) << "\t"; + std::cout << "shape: ["; + for (size_t i = 0; i < inputInfo.second.shape.size(); i++) { + std::cout << inputInfo.second.shape[i]; + if (i < inputInfo.second.shape.size() - 1) std::cout << ","; + } + std::cout << "]" << std::endl; + } + std::cout << "\n"; +} + +void RModel::PrintInitializedTensors() const { + std::cout << "Model initialized the following tensors:\n"; + for (auto& it: fInitializedTensors) { + std::cout << "Tensor name: \"" << it.first << "\"\t"; + std::cout << "type: " << ConvertTypeToString(it.second.type()) << "\t"; + std::cout << "shape: ["; + for (size_t i = 0; i < it.second.shape().size(); i++) { + std::cout << it.second.shape()[i]; + if (i < it.second.shape().size() - 1) std::cout << ","; + } + std::cout << "]"; + if (it.second.IsConstantTensor()) std::cout << " (Constant)"; + if (it.second.IsNotWritable()) std::cout << " (Not Writable)"; + std::cout << std::endl; + } + std::cout << "\n"; +} + +void RModel::PrintIntermediateTensors() const { + std::cout << "Model specify the following intermediate tensors:\n"; + for (auto& it: fIntermediateTensorInfos) { + std::cout << 
"Tensor name: \"" << it.first << "\"\t"; + std::cout << "type: " << ConvertTypeToString(it.second.type) << "\t"; + std::cout << "shape: ["; + for (size_t i = 0; i < it.second.shape.size(); i++) { + std::cout << it.second.shape[i]; + if (i < it.second.shape.size() - 1) std::cout << ","; + } + std::cout << "]" << std::endl; + } + std::cout << "\n"; +} + +void RModel::PrintDynamicTensors() const { + std::cout << "Model specify the following dynamic tensors:\n"; + for (auto& it: fDynamicTensorInfos) { + std::cout << "Tensor name: \"" << it.first << "\"\t"; + std::cout << "type: " << ConvertTypeToString(it.second.type) << "\t"; + std::cout << "shape: ["; + for (size_t i = 0; i < it.second.shape.size(); i++) { + std::cout << it.second.shape[i].GetVal(); + if (i < it.second.shape.size() - 1) std::cout << ","; + } + std::cout << "]" << std::endl; + } + std::cout << "\n"; +} + +void RModel::PrintOutputTensors() const { + std::cout << "Model specify the following output tensors:\n"; + for (auto& it: fOutputTensorNames) { + std::cout << "Tensor name: \"" << it << "\"\t"; + try { + auto shape = GetDimTensorShape(it); + std::cout << "with shape: " << ConvertDimShapeToString(shape) << std::endl; + } catch (...) 
{ + std::cout << "with shape not yet defined" << std::endl; + } + } + std::cout << "\n"; +} + +void RModel::HeadInitializedTensors(std::string name, int n_print) { + auto it = fInitializedTensors.find(name); + if (it == fInitializedTensors.end()) { + std::cout << "Tensor " << name << " not found in model's initialized tensor list" << std::endl; + return; + } + + std::cout << "Tensor name: " << it->first << "\t"; + std::cout << "type: " << ConvertTypeToString(it->second.type()) << "\t"; + int length =1; + std::cout << "shape: ["; + for (size_t i = 0; i < it->second.shape().size(); i++) { + std::cout << it->second.shape()[i]; + length *= it->second.shape()[i]; + if (i < it->second.shape().size() - 1) std::cout << ","; + } + std::cout << "]" << std::endl; + bool ellipsis = true; + if (n_print > length) { + n_print = length; + ellipsis = false; + } + + std::cout << "data: [" << std::endl; + if (it->second.type() == ETensorType::FLOAT) { + auto converted_data = it->second.data(); + for (int i =0; i < n_print; i++) { + std::cout << converted_data[i]; + if (i < n_print - 1) std::cout << " ,"; + } + } + if (ellipsis) std::cout << ", ..."; + std::cout << "]" << std::endl; + +} + +void RModel::OutputGenerated(std::string filename, bool append) { + + RModel_Base::OutputGenerated(filename, append); + + // write weights in a text file + if (fUseWeightFile) { + if (!filename.empty()) { + size_t pos = filename.find(".hxx"); + if (fWeightFile == WeightFileType::Text) + filename.replace(pos, 4, ".dat"); + if (fWeightFile == WeightFileType::RootBinary) { + filename = filename.erase(pos, 4); + filename += ".root"; + } + } else { + filename = fName; + filename += fWeightFile == WeightFileType::Text ? 
".dat" : ".root"; + } + WriteInitializedTensorsToFile(filename); + } +} + +#ifdef SOFIE_SUPPORT_ROOT_BINARY +void RModel::Streamer(TBuffer &R__b) { + if (R__b.IsReading()) { + RModel::Class()->ReadBuffer(R__b, this); + for (auto & i : fInitializedTensors) { + i.second.CastPersistentToShared(); + } + } + else { + for (auto & i : fInitializedTensors) { + i.second.CastSharedToPersistent(); + } + RModel::Class()->WriteBuffer(R__b, this); + } +} +#endif + +}//SOFIE diff --git a/core/src/RModel_ALPAKA.cxx b/core/src/RModel_ALPAKA.cxx new file mode 100644 index 0000000..621b701 --- /dev/null +++ b/core/src/RModel_ALPAKA.cxx @@ -0,0 +1,747 @@ +#include +#include +#include +#include +#include +#include +#include + +#ifdef SOFIE_SUPPORT_ROOT_BINARY +#include "TFile.h" +#endif + +#include "SOFIE/RModel.hxx" +#include "SOFIE/SOFIE_common.hxx" + +namespace SOFIE { + +void RModel::ComputeEltwiseFusionGroups() { + fEltwiseFusionGroups.clear(); + fOpToFusionGroupIdx.clear(); + fFusionIntermediateTensors.clear(); + + // Build tensor -> consumer op indices map + std::unordered_map> tensorConsumers; + for (size_t i = 0; i < fOperators.size(); i++) { + for (const auto& name : fOperators[i]->GetOpInputTensors()) + tensorConsumers[std::string(name)].push_back(i); + } + + // Returns true if tensorName is safe to treat as a fusion intermediate: + // consumed by exactly one op AND not a model output. + auto isFuseSafe = [&](const std::string& tensorName) -> bool { + for (const auto& outName : fOutputTensorNames) + if (outName == tensorName) return false; + auto it = tensorConsumers.find(tensorName); + return it != tensorConsumers.end() && it->second.size() == 1; + }; + + std::vector opAssigned(fOperators.size(), false); + + for (size_t i = 0; i < fOperators.size(); i++) { + if (opAssigned[i]) continue; + opAssigned[i] = true; + + EltwiseFusionGroup group; + group.opIndices.push_back(i); + + auto firstInputs = fOperators[i]->GetOpInputTensors(); + group.inputTensor = firstInputs.empty() ? 
"" : std::string(firstInputs[0]); + + // Extend chain: only if CURRENT op is elementwise and its single output can be fused + size_t current = i; + while (fOperators[current]->IsElementwise()) { + auto curOutputs = fOperators[current]->GetOpOutputTensors(); + if (curOutputs.size() != 1) break; + std::string curOut = std::string(curOutputs[0]); + if (!isFuseSafe(curOut)) break; + + size_t nextIdx = tensorConsumers.find(curOut)->second[0]; + // Must be strictly the next op in sequence and itself elementwise with single input + if (nextIdx != current + 1) break; + if (opAssigned[nextIdx]) break; + if (!fOperators[nextIdx]->IsElementwise()) break; + auto nextInputs = fOperators[nextIdx]->GetOpInputTensors(); + if (nextInputs.size() != 1) break; + + opAssigned[nextIdx] = true; + group.opIndices.push_back(nextIdx); + current = nextIdx; + } + + // Output tensor is the last op's output + auto lastOutputs = fOperators[current]->GetOpOutputTensors(); + group.outputTensor = lastOutputs.empty() ? "" : std::string(lastOutputs[0]); + + // Element count from intermediate tensor info (all op outputs are intermediates) + if (!group.outputTensor.empty()) { + auto it = fIntermediateTensorInfos.find(group.outputTensor); + if (it != fIntermediateTensorInfos.end()) + group.numElements = ConvertShapeToLength(it->second.shape); + } + + size_t gIdx = fEltwiseFusionGroups.size(); + for (auto opIdx : group.opIndices) + fOpToFusionGroupIdx[opIdx] = gIdx; + + // Mark all-but-last outputs as fusion intermediates (skip allocation) + if (group.isFused()) { + for (size_t k = 0; k + 1 < group.opIndices.size(); k++) { + auto midOuts = fOperators[group.opIndices[k]]->GetOpOutputTensors(); + if (!midOuts.empty()) + fFusionIntermediateTensors.insert(std::string(midOuts[0])); + } + } + + fEltwiseFusionGroups.push_back(std::move(group)); + } +} + +void RModel::GenerateInitializedTensorInfo_GPU_ALPAKA() { + if (!fInitializedTensors.empty()){ + fGC += "\n// initialized tensors for weights\n"; + } + + for 
(auto &i : fInitializedTensors) { + if (!fUseWeightFile || i.second.IsConstantTensor()) { + if (i.second.type() == ETensorType::FLOAT) + fGC += GenerateConstantTensorCode(i); + else if (i.second.type() == ETensorType::INT64) + fGC += GenerateConstantTensorCode(i); + + } + // case of tensors which are read from a file + size_t length = ConvertShapeToLength(i.second.shape()); + if (i.second.type() == ETensorType::FLOAT) { + fGC += "BufF1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type() == ETensorType::INT64) { + fGC += "BufI641D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } + + } +} + +void RModel::GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA() +{ + if (!fInitializedTensors.empty()) + fGC += "// temporary initialized tensors for loading weights\n"; + + for (auto &i : fInitializedTensors) { + if (fUseWeightFile && !i.second.IsConstantTensor()) { + // case of tensors which are read from a file + size_t length = ConvertShapeToLength(i.second.shape()); + if (i.second.type() == ETensorType::FLOAT) { + fGC += "std::vector tensor_" + i.first + "(" + std::to_string(length) + ");\n"; + } + } + } +} + +void RModel::GenerateGPU_ALPAKA_Buffers() { + if (!fIntermediateTensorInfos.empty()) { + std::string tensor_declaration_block = ""; + + for (auto &i : fIntermediateTensorInfos) { + // Skip tensors that are purely intermediate within a fused kernel chain + if (fFusionIntermediateTensors.count(i.first)) continue; + + size_t length = ConvertShapeToLength(i.second.shape); + + if (i.second.type == ETensorType::FLOAT) { + tensor_declaration_block += "BufF1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::DOUBLE) { + tensor_declaration_block += "BufD1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, 
Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::INT64) { + tensor_declaration_block += "BufI641D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::BOOL) { + tensor_declaration_block += "BufUI81D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } + } + + if (tensor_declaration_block.length()) { + fGC += "\n//--- declare and allocate the intermediate tensors\n" + tensor_declaration_block; + } + } + + // add also the dynamic tensors (only declarations, allocation will be done later) + if (!fDynamicTensorInfos.empty()) { + fGC += "//--- declare the dynamic tensors\n"; + fGC += "using bufDev_float = alpaka::Buf, size_t>;\n"; + fGC += "using bufDev_double = alpaka::Buf, size_t>;\n"; + fGC += "using bufDev_int64 = alpaka::Buf, size_t>;\n"; + + for (auto &i : fDynamicTensorInfos) { + if (i.second.type == ETensorType::FLOAT) { + fGC += "bufDev_float bufDev_" + i.first + ";\n"; + } else if (i.second.type == ETensorType::DOUBLE) { + fGC += "bufDev_double bufDev_" + i.first + ";\n"; + } else if (i.second.type == ETensorType::INT64) { + fGC += "bufDev_int64 bufDev_" + i.first + ";\n"; + } + } + } +} + +void RModel::GenerateDynamicTensorInfo_GPU_ALPAKA() { + fGC += "//---- allocate the intermediate dynamic tensors\n"; + std::stringstream out; + + for (auto &i : fDynamicTensorInfos) { + auto length = ConvertDimShapeToLength(i.second.shape); + out << SP << "if (" << length << " > 0) {\n"; + out << "auto bufDev_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" << length << "}));\n"; + out << SP << "}\n"; + } + fGC += out.str(); +} + +std::string RModel::GenerateInferSignature_GPU_ALPAKA(bool isdecl) { + // generate the infer signature given the inputs: eg. 
"BufF1D const deviceBuf_A, BufF1D const deviceBuf_B" + // if (isdecl = false) generate only calling signature (deviceBuf_A, deviceBuf_B, ....) + + auto GetBufType = [this](const std::string& name) -> std::string { + ETensorType type = GetTensorType(name); + if (type == ETensorType::FLOAT) return "BufF1D"; + if (type == ETensorType::DOUBLE) return "BufD1D"; + if (type == ETensorType::INT64) return "BufI641D"; + if (type == ETensorType::BOOL) return "BufUI81D"; + throw std::runtime_error("sofie: input tensor " + name + + " is of a data type which is not yet supported."); + }; + + std::string rGC; + std::unordered_map inputParams; + int i_input = 0; + for (auto &name : fInputTensorNames) { + // if is a dynamic tensor pass initial parameters + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + std::string pName = d.param; + if (d.isParam && inputParams.count(pName) == 0) { + if (isdecl) rGC += "size_t "; + rGC += d.param + ","; + inputParams[pName] = i_input; + } + } + } + if (isdecl) { + rGC += GetBufType(name) + " const "; + } + rGC += "deviceBuf_" + name + ","; + i_input++; + } + + if (fInputTensorNames.size() > 0) rGC.pop_back(); // remove last "," + return rGC; +} + +std::string RModel::GenerateImplSignature_GPU_ALPAKA(bool isdecl) { + // Like GenerateInferSignature_GPU_ALPAKA but uses ViewPlainPtr const& instead of Buf const. + // This lets _infer_impl accept non-owning views so both the typed and span-based infer + // wrappers can call it without duplication of operator code. 
+ + auto GetViewConstType = [this](const std::string& name) -> std::string { + ETensorType type = GetTensorType(name); + if (type == ETensorType::FLOAT) return "ViewConstF1D"; + if (type == ETensorType::DOUBLE) return "ViewConstD1D"; + if (type == ETensorType::INT64) return "ViewConstI641D"; + if (type == ETensorType::BOOL) return "ViewConstUI81D"; + throw std::runtime_error("sofie: input tensor " + name + + " is of a data type which is not yet supported."); + }; + + std::string rGC; + std::unordered_map inputParams; + int i_input = 0; + for (auto &name : fInputTensorNames) { + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + std::string pName = d.param; + if (d.isParam && inputParams.count(pName) == 0) { + if (isdecl) rGC += "size_t "; + rGC += d.param + ","; + inputParams[pName] = i_input; + } + } + } + if (isdecl) { + rGC += GetViewConstType(name) + " const& "; + } + rGC += "deviceBuf_" + name + ","; + i_input++; + } + + if (fInputTensorNames.size() > 0) rGC.pop_back(); + return rGC; +} + +void RModel::GenerateOutput_GPU_ALPAKA() { + if (fVerbose) + std::cout << "Generating main inference code for " << fName << std::endl; + + size_t outputSize = fOutputTensorNames.size(); + if (outputSize == 0) + throw std::runtime_error("sofie: output size=0 are not supported"); + + ETensorType eFirstOutputType = GetTensorType(*fOutputTensorNames.begin()); + bool sameOutputTypes = true; + for (std::string const &name : fOutputTensorNames) { + if (GetTensorType(name) != eFirstOutputType) + sameOutputTypes = false; + } + + auto GetViewConstType = [this](const std::string &name) -> std::string { + ETensorType type = GetTensorType(name); + if (type == ETensorType::FLOAT) return "ViewConstF1D"; + if (type == ETensorType::DOUBLE) return "ViewConstD1D"; + if (type == ETensorType::INT64) return "ViewConstI641D"; + if (type == ETensorType::BOOL) return "ViewConstUI81D"; + throw std::runtime_error("sofie: input tensor " + name + " is 
of an unsupported data type."); + }; + + // Collect deduplicated dynamic dimension parameter names in declaration order + std::vector dynParamNames; + { + std::unordered_map seen; + for (auto &name : fInputTensorNames) { + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + if (d.isParam && seen.count(d.param) == 0) { + dynParamNames.push_back(d.param); + seen[d.param] = 1; + } + } + } + } + } + + fGC += "\n\n"; + + // === 1. _infer_impl: all operator code, takes ViewPlainPtr const& for inputs === + fGC += "void _infer_impl("; + fGC += GenerateImplSignature_GPU_ALPAKA(); + fGC += "){\n"; + + std::set fusedGroupsLaunched; + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) + std::cout << "Generating code for operator .... " << op_idx << std::endl; + + auto gIt = fOpToFusionGroupIdx.find(op_idx); + size_t gIdx = (gIt != fOpToFusionGroupIdx.end()) ? gIt->second : SIZE_MAX; + bool inFusedGroup = (gIdx != SIZE_MAX) && fEltwiseFusionGroups[gIdx].isFused(); + + if (inFusedGroup) { + // Only emit the fused kernel launch once, at the chain leader + if (fEltwiseFusionGroups[gIdx].opIndices[0] == op_idx && !fusedGroupsLaunched.count(gIdx)) { + const auto& grp = fEltwiseFusionGroups[gIdx]; + std::string sfx = grp.suffix(); + std::string kname = "fusedEltwiseKernel" + sfx; + fGC += "\n//------ FUSED_ELTWISE_GPU_ALPAKA" + sfx + "\n"; + fGC += SP + "{\n"; + fGC += SP + SP + "auto const elementsPerThread_fused" + sfx + " = Vec::all(static_cast(1));\n"; + fGC += SP + SP + "auto const elementsPerGrid_fused" + sfx + " = Vec::all(Idx{" + std::to_string(grp.numElements) + "});\n"; + fGC += SP + SP + "auto const workDiv_fused" + sfx + " = sofie_workdiv(elementsPerGrid_fused" + sfx + ");\n"; + fGC += SP + SP + "auto task_fused" + sfx + " = alpaka::createTaskKernel(workDiv_fused" + sfx + ", " + kname + + ", alpaka::getPtrNative(deviceBuf_" + grp.inputTensor + "), alpaka::getPtrNative(deviceBuf_" + 
grp.outputTensor + + "), static_cast(" + std::to_string(grp.numElements) + "));\n"; + fGC += SP + SP + "alpaka::enqueue(queue, task_fused" + sfx + ");\n"; + fGC += SP + "}\n"; + fusedGroupsLaunched.insert(gIdx); + } + // Chain followers: skip — their logic is inside the fused kernel + } else { + fGC += fOperators[op_idx]->Generate_GPU_ALPAKA(std::to_string(op_idx)); + } + } + fGC += "\n\n alpaka::wait(queue);\n"; + fGC += "}\n\n"; + + // === 2. Span-based infer: generic entry point === + // Dynamic params are forwarded explicitly; non-float inputs not yet supported here. + std::string spanDynDecl; + for (auto &p : dynParamNames) + spanDynDecl += ", size_t " + p; + + fGC += "__host__ void infer(std::span inputs, std::span outputs" + spanDynDecl + "){\n"; + + // Build _infer_impl call: dyn params first, then inputs[i] + { + fGC += SP + "_infer_impl("; + bool first = true; + for (auto &p : dynParamNames) { + if (!first) fGC += ", "; + fGC += p; + first = false; + } + for (size_t i = 0; i < fInputTensorNames.size(); i++) { + if (!first) fGC += ", "; + fGC += "inputs[" + std::to_string(i) + "]"; + first = false; + } + fGC += ");\n"; + } + + // Copy member output buffers into caller-provided output views + for (size_t i = 0; i < outputSize; i++) { + std::string tensorName = *(fOutputTensorNames.begin() + i); + fGC += SP + "alpaka::memcpy(queue, outputs[" + std::to_string(i) + "], deviceBuf_" + tensorName + ");\n"; + } + fGC += SP + "alpaka::wait(queue);\n"; + fGC += "}\n\n"; + + // === 3. 
Typed infer: backward-compatible wrapper that delegates to _infer_impl === + // Build return type + std::string returnType; + if (outputSize == 1) { + returnType = "alpaka::Buf"; + } else if (sameOutputTypes) { + returnType = "std::array, " + std::to_string(outputSize) + ">"; + } else { + returnType = "std::tuple<"; + for (size_t i = 0; i < outputSize; i++) { + std::string tname = *(fOutputTensorNames.begin() + i); + returnType += "alpaka::Buf"; + if (i < outputSize - 1) returnType += ","; + } + returnType += ">"; + } + + fGC += "__host__ " + returnType + " infer("; + fGC += GenerateInferSignature_GPU_ALPAKA(); + fGC += "){\n"; + + // Wrap each typed input buffer in a ViewConstXX, then call _infer_impl + std::vector typedImplArgs; + for (auto &p : dynParamNames) + typedImplArgs.push_back(p); + for (auto &name : fInputTensorNames) { + std::string viewType = GetViewConstType(name); + fGC += SP + viewType + " const view_" + name + + "{alpaka::getPtrNative(deviceBuf_" + name + "), devAcc, alpaka::getExtents(deviceBuf_" + name + ")};\n"; + typedImplArgs.push_back("view_" + name); + } + + fGC += SP + "_infer_impl("; + for (size_t i = 0; i < typedImplArgs.size(); i++) { + if (i > 0) fGC += ", "; + fGC += typedImplArgs[i]; + } + fGC += ");\n"; + + // Return the member output buffer(s) + fGC += SP + "return "; + if (outputSize > 1) fGC += "{"; + for (size_t i = 0; i < outputSize; i++) { + std::string tensorName = *(fOutputTensorNames.begin() + i); + fGC += "deviceBuf_" + tensorName; + if (i < outputSize - 1) fGC += ","; + } + if (outputSize > 1) fGC += "}"; + fGC += ";\n"; + fGC += "}\n"; +} + +void RModel::GenerateSessionCode_GPU_ALPAKA() { + + std::set registered_operators; + std::set fusedGroupsEmitted; // tracks which fusion groups have had their struct/decl emitted + std::set single_initialized_operators = { + SOFIE::OperatorKind::RELU, + SOFIE::OperatorKind::SIGMOID, + SOFIE::OperatorKind::TANH, + SOFIE::OperatorKind::SOFTMAX, + SOFIE::OperatorKind::LEAKYRELU, + 
SOFIE::OperatorKind::EINSUM, + SOFIE::OperatorKind::COMPARISON, + SOFIE::OperatorKind::ELU, + SOFIE::OperatorKind::UNARY_RECIPROCAL, + SOFIE::OperatorKind::UNARY_SQRT, + SOFIE::OperatorKind::UNARY_NEG, + SOFIE::OperatorKind::UNARY_EXP, + SOFIE::OperatorKind::UNARY_LOG, + SOFIE::OperatorKind::UNARY_SIN, + SOFIE::OperatorKind::UNARY_COS, + SOFIE::OperatorKind::UNARY_ABS + }; + + bool OpNeedsBlas = false; + + // Generate kernel struct declarations, accounting for elementwise fusion groups. + // For fused chains (≥2 elementwise ops), a single FusedEltwiseKernel is generated + // instead of individual kernel structs for the participating ops. + fGC += "\n//--- ALPAKA Kernels\n"; + for (size_t id = 0; id < fOperators.size(); id++) { + if(fOperators[id]->GetKind() == OperatorKind::GEMM || fOperators[id]->GetKind() == OperatorKind::CONV) { + OpNeedsBlas = true; + } + + auto gIt = fOpToFusionGroupIdx.find(id); + size_t gIdx = (gIt != fOpToFusionGroupIdx.end()) ? gIt->second : SIZE_MAX; + bool inFusedGroup = (gIdx != SIZE_MAX) && fEltwiseFusionGroups[gIdx].isFused(); + + if (inFusedGroup) { + // Only emit the fused kernel struct once, at the chain leader + if (fEltwiseFusionGroups[gIdx].opIndices[0] == id && !fusedGroupsEmitted.count(gIdx)) { + const auto& grp = fEltwiseFusionGroups[gIdx]; + std::string sfx = grp.suffix(); + fGC += "\n//------ FUSED_ELTWISE_KERNEL" + sfx + "\n"; + fGC += "struct FusedEltwiseKernel" + sfx + " {\n"; + fGC += SP + "template\n"; + fGC += SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* __restrict__ data, T* __restrict__ out, std::size_t n) const {\n"; + fGC += SP + SP + "const auto idx = alpaka::getIdx(acc)[0];\n"; + fGC += SP + SP + "if (idx < n) {\n"; + fGC += SP + SP + SP + "T v = data[idx];\n"; + for (size_t opIdx : grp.opIndices) + fGC += SP + SP + SP + "v = " + fOperators[opIdx]->GetElementwiseExpr("v") + ";\n"; + fGC += SP + SP + SP + "out[idx] = v;\n"; + fGC += SP + SP + "}\n"; + fGC += SP + "}\n"; + fGC += "};\n"; + 
fusedGroupsEmitted.insert(gIdx); + } + // Chain followers: skip (their logic is inside the fused kernel) + } else { + // Unfused op: generate individual kernel struct (with dedup for single_initialized_operators) + if (single_initialized_operators.find(fOperators[id]->GetKind()) != single_initialized_operators.end()) { + if (registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) { + if (fVerbose) + std::cout << "Generating ALPAKA kernel for operator " << toString(fOperators[id]->GetKind()) << std::endl; + fGC += fOperators[id]->Generate_GPU_Kernel_ALPAKA(std::to_string(id)); + registered_operators.insert(fOperators[id]->GetKind()); + } + } else { + if (fVerbose) + std::cout << "Generating ALPAKA kernel for operator " << toString(fOperators[id]->GetKind()) << std::endl; + fGC += fOperators[id]->Generate_GPU_Kernel_ALPAKA(std::to_string(id)); + } + } + } + + // Emit a namespace-scope helper that avoids alpaka::getValidWorkDiv (which calls + // cudaFuncGetAttributes via a void* cast — broken on CUDA 12.x for JIT kernels). 
+ fGC += "\ntemplate\n"; + fGC += "inline alpaka::WorkDivMembers sofie_workdiv(\n"; + fGC += " alpaka::Vec const& numElems, TIdx blockSz = TIdx{256})\n{\n"; + fGC += " auto const numBlocks = alpaka::Vec::all(\n"; + fGC += " (numElems[0] + blockSz - TIdx{1}) / blockSz);\n"; + fGC += " return alpaka::WorkDivMembers(\n"; + fGC += " numBlocks,\n"; + fGC += " alpaka::Vec::all(blockSz),\n"; + fGC += " alpaka::Vec::all(TIdx{1}));\n"; + fGC += "}\n\n"; + + // define the Session struct (for GNN this is generated in RModel_GNN) + fGC += "\n\ntemplate \n"; + if (fUseSession) { + if (!fIsSubGraph) + fGC += "struct Session {\n\n"; + else + fGC += "struct Session_" + fName + " {\n\n"; + } + + // define host and device accelerators + fGC += "using Idx = std::size_t;\n"; + fGC += "using Dim = alpaka::DimInt<1>;\n"; + fGC += "using Acc = alpaka::TagToAcc;\n"; + fGC += "using DevAcc = alpaka::Dev;\n\n"; + fGC += "using QueueProperty = alpaka::NonBlocking;\n"; + fGC += "using QueueAcc = alpaka::Queue;\n\n"; + fGC += "using BufF1D = alpaka::Buf;\n"; + fGC += "using BufD1D = alpaka::Buf;\n"; + fGC += "using BufI641D = alpaka::Buf;\n"; + fGC += "using BufUI81D = alpaka::Buf;\n\n"; + fGC += "// Non-owning device view types (ViewPlainPtr) for the span-based infer interface\n"; + fGC += "using ViewF1D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewConstF1D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewD1D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewConstD1D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewI641D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewConstI641D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewUI81D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewConstUI81D = alpaka::ViewPlainPtr;\n\n"; + + fGC += "\nalpaka::Platform const platform{};\n"; + fGC += "DevAcc devAcc = alpaka::getDevByIdx(platform, 0);\n"; + fGC += "alpaka::PlatformCpu platformHost{};\n"; + fGC += "alpaka::DevCpu hostAcc = alpaka::getDevByIdx(platformHost, 0);\n"; + fGC += "QueueAcc 
queue{devAcc};\n"; + fGC += "Idx threadsPerBlock = 256;\n"; + fGC += "\nusing Ext1D = alpaka::Vec;\n"; + fGC += "using Vec = alpaka::Vec;\n"; + if (OpNeedsBlas) { + fGC += "\n\n// BLAS declarations\n"; + fGC += "sofieBLAS blas{queue};\n"; + } + + GenerateInitializedTensorInfo_GPU_ALPAKA(); + GenerateGPU_ALPAKA_Buffers(); + GenerateOperatorDeclarations(); + + // add subgraph session + if (!fSubGraphs.empty()) + fGC += "// subgraph sessions\n"; + for (auto &graph : fSubGraphs) { + fGC += "Session_" + graph->fName + " fSession_" + graph->fName + ";\n"; + } + + // Session constructor + if (fUseSession) { + std::string sessionName = "\n\nSession"; + if (fIsSubGraph) + sessionName += "_" + fName; + + if (fUseWeightFile) { + std::string fileName = fName; + if (fWeightFile == WeightFileType::Text) + fileName += ".dat"; + if (fWeightFile == WeightFileType::RootBinary) + fileName += ".root"; + + fGC += sessionName + "(std::string filename =\"" + fileName + "\""; + } else { + fGC += sessionName + "(std::string = \"\""; + } + + if (!fShapeParams.empty()) { + for (auto &p : fShapeParams) { + fGC += ",\n"; + fGC += " size_t " + p.first + " = " + p.second; + } + } + fGC += ") {\n"; + + GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA(); + if (fUseWeightFile) { + fGC += "\n//--- reading weights from file\n"; + ReadInitializedTensorsFromFile(0); + fGC += "\n"; + } + + MoveInitializedTensorsToBuffers_ALPAKA(); + GenerateDynamicTensorInfo_GPU_ALPAKA(); + + for (size_t id = 0; id < fOperators.size(); id++) { + fGC += fOperators[id]->GenerateInitCode_GPU_ALPAKA(); + if (fOperators[id]->GetKind() == OperatorKind::GEMM || fOperators[id]->GetKind() == OperatorKind::CONV) { + fGC += "\nblas.addLayoutConfig("+fOperators[id]->GetBlasConfig()+");\n"; + } + } + + fGC += "\nalpaka::wait(queue);\n"; + fGC += "}\n\n"; + } + + registered_operators.clear(); + fusedGroupsEmitted.clear(); + + for (size_t id = 0; id < fOperators.size(); id++) { + auto gIt = fOpToFusionGroupIdx.find(id); + 
size_t gIdx = (gIt != fOpToFusionGroupIdx.end()) ? gIt->second : SIZE_MAX; + bool inFusedGroup = (gIdx != SIZE_MAX) && fEltwiseFusionGroups[gIdx].isFused(); + + if (inFusedGroup) { + if (fEltwiseFusionGroups[gIdx].opIndices[0] == id && !fusedGroupsEmitted.count(gIdx)) { + std::string sfx = fEltwiseFusionGroups[gIdx].suffix(); + fGC += SP + "FusedEltwiseKernel" + sfx + " fusedEltwiseKernel" + sfx + ";\n"; + fusedGroupsEmitted.insert(gIdx); + } + } else { + if (single_initialized_operators.find(fOperators[id]->GetKind()) != single_initialized_operators.end()) { + if (registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) { + if (fVerbose) + std::cout << "Declaring ALPAKA kernel for operator " << toString(fOperators[id]->GetKind()) << std::endl; + fGC += fOperators[id]->Generate_GPU_Kernel_Definitions_ALPAKA(std::to_string(id)); + registered_operators.insert(fOperators[id]->GetKind()); + } + } else { + if (fVerbose) + std::cout << "Declaring ALPAKA kernel for operator " << toString(fOperators[id]->GetKind()) << std::endl; + fGC += fOperators[id]->Generate_GPU_Kernel_Definitions_ALPAKA(std::to_string(id)); + } + } + } + + GenerateOutput_GPU_ALPAKA(); + + if (fUseSession && !fIsGNNComponent) { + fGC += "}; // end of Session\n"; + } +} + +void RModel::GenerateGPU_ALPAKA(std::underlying_type_t options, int batchSize, bool verbose) { + fVerbose = true; + fBatchSize = batchSize; + + if (static_cast>(Options::kNoSession) & options) { + fUseSession = false; + fWeightFile = WeightFileType::None; + } + if (static_cast>(Options::kNoWeightFile) & options) { + fUseWeightFile = false; + fWeightFile = WeightFileType::None; + } + if (static_cast>(Options::kRootBinaryWeightFile) & options) { + fUseWeightFile = true; + fWeightFile = WeightFileType::RootBinary; + } + if (fUseWeightFile && !fUseSession) { + throw std::runtime_error( + "sofie: RModel::Generate: cannot use a separate weight file without generating a Session class"); + } + + if 
(static_cast>(Options::kGNN) & options || + static_cast>(Options::kGNNComponent) & options) + throw std::runtime_error("SOFIE GPU does not yet supports GNN Inference."); + + Initialize(batchSize, verbose); + ComputeEltwiseFusionGroups(); + + std::string hgname; + if (!fIsSubGraph) { + fGC.clear(); + GenerateHeaderInfo_GPU_ALPAKA(hgname); + } + + if (fVerbose) + std::cout << "generate Main session code - model " << fName << std::endl; + + GenerateSessionCode_GPU_ALPAKA(); + + if (!fIsSubGraph) { + fGC += ("} //SOFIE_" + fName + "\n"); + fGC += "\n#endif // " + hgname + "\n"; + } +} + +void RModel::MoveInitializedTensorsToBuffers_ALPAKA(){ + for (auto &i : fInitializedTensors) { + if (i.second.IsNotWritable()) continue; + std::string tensor_name = "tensor_" + i.first; + auto length = ConvertShapeToLength(i.second.shape()); + std::string slength = std::to_string(length); + // Use the 3-argument createView(dev, container, extent) which calls std::data() + // internally — works for both std::vector and raw C arrays. 
+ fGC += " auto hostBuf_"+i.first+" = alpaka::createView(hostAcc, tensor_"+i.first+", " + slength + ");\n"; + fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n"; + } + } + +} // namespace SOFIE diff --git a/src/SOFIE_core/src/RModel_Base.cxx b/core/src/RModel_Base.cxx similarity index 60% rename from src/SOFIE_core/src/RModel_Base.cxx rename to core/src/RModel_Base.cxx index d4d1f1c..9c49e37 100644 --- a/src/SOFIE_core/src/RModel_Base.cxx +++ b/core/src/RModel_Base.cxx @@ -32,9 +32,16 @@ void RModel_Base::GenerateHeaderInfo(std::string& hgname) { fGC += "#include \"SOFIE/SOFIE_common.hxx\"\n"; if (fUseWeightFile) fGC += "#include \n"; - // Include TFile when saving the weights in a binary ROOT file - if (fWeightFile == WeightFileType::RootBinary) - fGC += "#include \"TFile.h\"\n"; + + if (fWeightFile == WeightFileType::RootBinary){ + #ifdef SOFIE_SUPPORT_ROOT_BINARY + // Include TFile when saving the weights in a binary ROOT file + fGC += "#include \"TFile.h\"\n"; + #else + throw std::runtime_error("sofie: ROOT binary weight file option is enabled but the code is not compiled with ROOT support"); + #endif + + } fGC += "\nnamespace SOFIE_" + fName + "{\n"; if (!fNeededBlasRoutines.empty()) { @@ -58,6 +65,45 @@ void RModel_Base::GenerateHeaderInfo(std::string& hgname) { } } +void RModel_Base::GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname) { + fGC += ("//Code generated automatically by TMVA for GPU Inference using ALPAKA of Model file [" + fFileName + "] at [" + fParseTime.substr(0, fParseTime.length()-1) +"] \n"); + // add header guards + hgname = fName; + std::transform(hgname.begin(), hgname.end(), hgname.begin(), [](unsigned char c) { + return std::toupper(c); + } ); + hgname = "SOFIE_" + hgname; + fGC += "\n#ifndef " + hgname + "\n"; + fGC += "#define " + hgname + "\n\n"; + for (auto& i: fNeededStdLib) { + fGC += "#include <" + i + ">\n"; + } + for (auto& i: fCustomOpHeaders) { + fGC += "#include \"" + i + "\"\n"; + } + fGC += 
"#include \n"; + fGC += "#include \n"; + fGC += "#include \n"; + + // for the session we need to include SOFIE_Common functions + //needed for convolution operator (need to add a flag) + fGC += "#include \"SOFIE/SOFIE_common.hxx\"\n"; + if (fUseWeightFile) + fGC += "#include \n"; + + if (fWeightFile == WeightFileType::RootBinary){ + #ifdef SOFIE_SUPPORT_ROOT_BINARY + // Include TFile when saving the weights in a binary ROOT file + fGC += "#include \"TFile.h\"\n"; + #else + throw std::runtime_error("sofie: ROOT binary weight file option is enabled but the code is not compiled with ROOT support"); + #endif + } + + fGC += "\nusing Dim1D = alpaka::DimInt<1>;\n"; + fGC += "\nnamespace SOFIE_" + fName + "{\n"; +} + void RModel_Base::OutputGenerated(std::string filename, bool append) { // the model can be appended only if a file name is provided if (filename.empty()) { @@ -71,7 +117,7 @@ void RModel_Base::OutputGenerated(std::string filename, bool append) { else f.open(filename); if (!f.is_open()) { - throw std::runtime_error("tmva-sofie failed to open file for output generated inference code"); + throw std::runtime_error("sofie failed to open file for output generated inference code"); } f << fGC; f.close(); diff --git a/src/SOFIE_core/src/RModel_GNN.cxx b/core/src/RModel_GNN.cxx similarity index 98% rename from src/SOFIE_core/src/RModel_GNN.cxx rename to core/src/RModel_GNN.cxx index a1dfe06..3dae254 100644 --- a/src/SOFIE_core/src/RModel_GNN.cxx +++ b/core/src/RModel_GNN.cxx @@ -94,7 +94,7 @@ void RModel_GNN::Generate() { // the number of output edges features can be smaller, so we need to correct here auto num_edge_features_input = num_edge_features; - auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto edges_update_output_shape = 
edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) { num_edge_features = edges_update_output_shape[1].dim; } @@ -117,7 +117,7 @@ void RModel_GNN::Generate() { // we need to correct the output number of node features auto num_node_features_input = num_node_features; - auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) { num_node_features = nodes_update_output_shape[1].dim; } diff --git a/src/SOFIE_core/src/RModel_GraphIndependent.cxx b/core/src/RModel_GraphIndependent.cxx similarity index 97% rename from src/SOFIE_core/src/RModel_GraphIndependent.cxx rename to core/src/RModel_GraphIndependent.cxx index bab06b3..cd62d0c 100644 --- a/src/SOFIE_core/src/RModel_GraphIndependent.cxx +++ b/core/src/RModel_GraphIndependent.cxx @@ -81,7 +81,7 @@ void RModel_GraphIndependent::Generate() { // the number of output edges features can be smaller, so we need to correct here // assume num_edge_features is not a parametric shape - auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) { num_edge_features = edges_update_output_shape[1].dim; } @@ -100,7 +100,7 @@ void 
RModel_GraphIndependent::Generate() { fGC+="};\n}\n"; // we need to correct the output number of node features - auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) { num_node_features = nodes_update_output_shape[1].dim; } @@ -119,7 +119,7 @@ void RModel_GraphIndependent::Generate() { // we need to correct the output number of global features // global features are in shape[1] #if 0 - auto globals_update_output_shape = globals_update_block->GetFunctionBlock()->GetDynamicTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto globals_update_output_shape = globals_update_block->GetFunctionBlock()->GetDimTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!globals_update_output_shape[1].isParam && globals_update_output_shape[1].dim != num_global_features_input) { num_global_features = globals_update_output_shape[1].dim; } diff --git a/src/SOFIE_core/src/SOFIE_common.cxx b/core/src/SOFIE_common.cxx similarity index 50% rename from src/SOFIE_core/src/SOFIE_common.cxx rename to core/src/SOFIE_common.cxx index ad74313..a2bafde 100644 --- a/src/SOFIE_core/src/SOFIE_common.cxx +++ b/core/src/SOFIE_common.cxx @@ -1,15 +1,18 @@ #include "SOFIE/SOFIE_common.hxx" -#include + +#include #include #include +#include +#include +#include - -namespace SOFIE{ +namespace SOFIE { /// @brief Convert shape from integer format to dynamic one (based on Dim) /// @param shape /// @return shape based on Dim -std::vector ConvertShapeToDim(std::vector shape){ +std::vector ConvertShapeToDim(const std::vector & shape){ std::vector ret_shape(shape.size()); for 
(size_t i =0; i < shape.size(); i++){ ret_shape[i].dim = shape[i]; @@ -20,7 +23,7 @@ std::vector ConvertShapeToDim(std::vector shape){ /// @brief Convert shape based on Dim to integer format /// @param shape /// @return shape based on integer. Return an empty shape in case shape is dynamic (has a parameter) -std::vector ConvertShapeToInt(std::vector shape){ +std::vector ConvertShapeToInt(const std::vector & shape){ std::vector ret_shape(shape.size()); for (size_t i =0; i < shape.size(); i++){ if (shape[i].isParam) { @@ -46,18 +49,35 @@ std::vector ConvertShapeToInt(std::vector shape){ } -std::size_t ConvertShapeToLength(std::vector shape){ +std::size_t ConvertShapeToLength(const std::vector & shape){ // Empty shape represent scalar values, so we return a length=1 std::size_t fLength = 1; for (auto& dim: shape) fLength *= dim; return fLength; } +std::size_t ConvertShapeToLength(const std::vector & shape){ + // convert generic shape to a string + // multiply all the integer specified dimensions of the shape + std::size_t length = 1; + for (size_t i = 0; i < shape.size(); i++) { + if (!shape[i].isParam) { + length *= shape[i].dim; + } else { + return static_cast(-1); // return -1 in case of parametric shapes + } + } + return length; +} + std::string ConvertTypeToString(ETensorType type){ switch(type){ case ETensorType::FLOAT : { return "float"; } + case ETensorType::INT8 : { + return "int8_t"; + } case ETensorType::INT16 : { return "int16_t"; } @@ -67,6 +87,9 @@ std::string ConvertTypeToString(ETensorType type){ case ETensorType::INT64 : { return "int64_t"; } + case ETensorType::UINT8 : { + return "uint8_t"; + } case ETensorType::UINT16 : { return "uint16_t"; } @@ -80,7 +103,7 @@ std::string ConvertTypeToString(ETensorType type){ return "double"; } case ETensorType::BOOL : { - return "bool"; + return "uint8_t"; } default:{ return "other_" + std::to_string( (int) type); @@ -106,7 +129,7 @@ ETensorType ConvertStringToType(std::string type){ } } -std::string 
ConvertShapeToString(std::vector shape) { +std::string ConvertShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { @@ -117,41 +140,49 @@ std::string ConvertShapeToString(std::vector shape) { return out.str(); } -std::string ConvertDynamicShapeToString(std::vector shape) { +std::string ConvertDimShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { - out << shape[i].GetVal(); + out << shape[i]; if (i < shape.size()-1) out << " , "; } out << " }"; return out.str(); } -std::string ConvertDynamicShapeToLength(std::vector shape) { +std::string ConvertDimShapeToLength(const std::vector & shape) { // convert generic shape to a string // multiply all the integer specified dimensions of the shape std::string length; - size_t int_length = 0; + // case of empty vectors return 1 + if (shape.empty()) return "1"; + int64_t int_length = -1; for (size_t i = 0; i < shape.size(); i++) { if (shape[i].isParam) { if (!length.empty()) length += " * "; length += shape[i].param; } else { - if (int_length == 0) + if (int_length == -1) int_length = shape[i].dim; else int_length *= shape[i].dim; } } // multiply the integer components to the parametric one - if (int_length > 0) { - if (!length.empty()) length += " * "; - length += std::to_string(int_length); + // if larger than 1 - otherwise returns -1 + if (int_length >= 0) { + if (!length.empty() && int_length > 1) { + length += " * "; + length += std::to_string(int_length); + } else if (length.empty()) { // case is full known shape + length = std::to_string(int_length); + } } return length; } + namespace{ template static inline void copy_vector_data(int_t no_of_copies, int_t input_size, T* input, T* target){ //only visible within this translation unit @@ -169,6 +200,12 @@ static inline void copy_vector_data(int_t no_of_copies, int_t input_size, T* inp } } +bool IsInteger(const std::string & s) { + int 
value; + auto [ptr, ec] = std::from_chars(s.data(), s.data() + s.size(), value); + return ec == std::errc() && ptr == s.data() + s.size(); +} + bool UTILITY::AreSameShape(const std::vector& shapeA, const std::vector& shapeB) { if (shapeA.size() != shapeB.size()) { return false; @@ -330,17 +367,24 @@ std::vector UTILITY::MultidirectionalBroadcastShape(std::vector UTILITY::UnidirectionalBroadcastShape(std::vector shapeA, std::vector shapeB) +// check multi-directional broadcasting of two shapes (need to pass inputs by non const ref. since we might prepends with one's +// return a pair of integer flag and new broadcasted shape +// if flag = 0: shape are identical +// flag = 1: return shape is equal to A, we broadcast B +// flag = 2: return shape is equal to B we broadcast A +// flag = 3: return shape is common of two we broadcast A and B to output +std::pair> UTILITY::MultidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { size_t sizeA = shapeA.size(); size_t sizeB = shapeB.size(); // Check if A and B have the same shape if (UTILITY::AreSameShape(shapeA, shapeB)){ - return shapeA; + return std::make_pair(0, shapeA); } // Find the common shape of A and B size_t size = std::max(sizeA, sizeB); if (sizeA < size) { + // prepend 1's in A to make of same shape as B std::vector newShapeA(size, 1); size_t offset = size - sizeA; std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset); @@ -359,36 +403,117 @@ std::vector UTILITY::UnidirectionalBroadcastShape(std::vector s break; } } + int broadcastFlag = 0; if (broadcastable) { // The output shape is max(outShape, targetShape) std::vector targetShape(size, 1); for (size_t i = 0; i < size; i++) { targetShape[i] = std::max(shapeA[i], shapeB[i]); + if (shapeB[i] < targetShape[i]) broadcastFlag |= 1; + if (shapeA[i] < targetShape[i]) broadcastFlag |= 2; } - return targetShape; + return std::make_pair(broadcastFlag, targetShape); } else { throw - std::runtime_error("TMVA::SOFIE - Error unidirectional 
broadcasting tensors of shape " + std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape " + ConvertShapeToString(shapeA) + " and " + ConvertShapeToString(shapeB) + " to a common shape."); } } +// unidirectional broadcast- of shape A to target B +std::vector UTILITY::UnidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) +{ + auto ret = UTILITY::MultidirectionalBroadcastShape(shapeB, shapeA); + if (ret.first > 1) { + throw + std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " + + ConvertShapeToString(shapeA) + " to " + ConvertShapeToString(shapeB) + + " in a common shape."); + } + return ret.second; +} + +// for broadcasting Dim shapes +// flag indicates also which vector needs to be broadcasted +// flag & 1 == 1 : broadcast B -> A +// flag & 2 == 2 : broadcast A -> B +// flag & 4 == 4 a run time check is needed on shapes with values +std::pair> UTILITY::MultidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { + size_t sizeA = shapeA.size(); + size_t sizeB = shapeB.size(); + // Check if A and B have the same shape + if (UTILITY::AreSameShape(shapeA, shapeB)){ + return std::make_pair(0, shapeA); + } + // Find the common shape of A and B + size_t size = std::max(sizeA, sizeB); + if (sizeA < size) { + // prepend 1's in A to make of same shape as B + std::vector newShapeA(size, Dim{1}); + size_t offset = size - sizeA; + std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset); + shapeA = std::move(newShapeA); + } + if (sizeB < size) { + std::vector newShapeB(size, Dim{1}); + size_t offset = size - sizeB; + std::copy(shapeB.begin(), shapeB.end(), newShapeB.begin() + offset); + shapeB = std::move(newShapeB); + } + + int broadcastFlag = 0; + // The output shape is targetShape + std::vector targetShape(size); + for (size_t i = 0; i < size; i++) { + // assume we broadcast to the parametric value + if (shapeA[i] == shapeB[i]) { + targetShape[i] = shapeA[i]; + } 
else if (shapeA[i].isParam && shapeB[i].GetVal() == "1" ) { + // broadcast B to A (case A is parametric with ) + targetShape[i] = shapeA[i]; + broadcastFlag |= 1; + } else if (shapeA[i].GetVal() == "1" && shapeB[i].isParam) { + // broadcast A to B + targetShape[i] = shapeB[i]; + broadcastFlag |= 2; + } else if (!shapeA[i].isParam && !shapeB[i].isParam) { + if (shapeB[i].dim == 1) { + targetShape[i] = shapeA[i]; + broadcastFlag |= 1; + } else if (shapeA[i].dim == 1) { + targetShape[i] = shapeB[i]; + broadcastFlag |= 2; + } else { + // non broadcastable case cannot have A and B two different defined shapes different than one + broadcastFlag = -1; + } + } else if (shapeA[i].isParam && shapeB[i].isParam) { + // full dynamic case - we will decided at run time + std::stringstream s; + s << "std::max(" << shapeA[i] << "," << shapeB[i] << ")"; + // use -1 for dim to indicate is an expression + targetShape[i] = Dim { s.str() , static_cast(-1)}; + broadcastFlag |= 4; + } else if (shapeA[i].isParam && !shapeB[i].isParam) { + // A -> B need to check at run time if consistent + targetShape[i] = shapeB[i]; + broadcastFlag |= 6; + } else if (!shapeA[i].isParam && shapeB[i].isParam) { + // B -> A need to check at run time if consistent + targetShape[i] = shapeA[i]; + broadcastFlag |= 5; + } else { + // all cases should be covered + throw std::runtime_error("TMVA::SOFIE - Fatal error in MultiDirectionalBroadCastDimShape"); + } + } + if (broadcastFlag == -1) { + throw std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape " + + ConvertDimShapeToString(shapeA) + " and " + ConvertDimShapeToString(shapeB) + + " to a common shape."); + } -// UNidirectional boradcast specializaiton for vector - -// specialization for vector of boolean -void UTILITY::UnidirectionalBroadcast(const std::vector & data, const std::vector& shape, const std::vector& targetShape, std::vector & broadcastedData) - { - // Prepend shape with ones - auto ncdata = const_cast &>(data); - 
if (shape.size() < targetShape.size()) { - size_t targetSize = targetShape.size(); - std::vector newShape(targetSize, 1); - size_t offset = targetSize - shape.size(); - std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - UTILITY::BroadcastTensor &, std::vector &>(ncdata, newShape, targetShape, broadcastedData); - } - UTILITY::BroadcastTensor &, std::vector &>(ncdata, shape, targetShape, broadcastedData); + return std::make_pair(broadcastFlag, targetShape); } std::string UTILITY::Clean_name(std::string input_tensor_name){ @@ -413,15 +538,146 @@ std::vector UTILITY::ComputeStrideFromShape(const std::vector & shape) // assume row major layout const auto size = shape.size(); std::vector strides(size); - strides[size-1] = Dim{1}; - for (std::size_t i = 1; i < size; i++) { - if (!shape[size-i].isParam && !strides[size-i].isParam) - strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim}; - else - strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())}; + if (size > 0) { + strides[size-1] = Dim{1}; + for (std::size_t i = 1; i < size; i++) { + if (!shape[size-i].isParam && !strides[size-i].isParam) + strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim}; + else { + if (strides[size-i].GetVal() == "1") + strides[size - 1 - i] = shape[size-i]; + else if (shape[size-i].GetVal() == "1") + strides[size - 1 - i] = strides[size-i]; + else + strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())}; + } + } } return strides; } +struct FreeBlock { + std::size_t offset; + std::size_t size; + bool operator<(const FreeBlock& other) const { + // order by offset for deterministic coalescing + return offset < other.offset; + } +}; + +struct MemoryEvent { + int t; // time (i.e. 
operator index) + int type; // 0 = END first, 1 = START + int idx; // tensor index + bool operator<(const MemoryEvent& o) const { + if (t != o.t) return t < o.t; + return type < o.type; // END before START at the same time + } +}; + +/// Greedy best-fit planner with coalescing free list. +MemoryResult OrganizeMemory(const std::vector & tensorsInfo ) +{ + // Basic validation + for (const auto &t : tensorsInfo) { + if (!(t.end > t.begin)) { + throw std::runtime_error("Each tensor must have end > begin."); + } + } + + // Build events: free before allocate at equal times. + std::vector events; + events.reserve(tensorsInfo.size() * 2); + for (int i = 0; i < (int)tensorsInfo.size(); ++i) { + events.push_back({tensorsInfo[i].end, 0, i}); // END + events.push_back({tensorsInfo[i].begin, 1, i}); // START + } + std::sort(events.begin(), events.end()); + + std::vector tensorsOffset(tensorsInfo.size()); + + // Free list ordered by offset (for O(log n) coalescing) + // and faster insert/erase with respect to a vector + std::set free_list; + + // Bookkeeping: size/offset map for frees. + std::unordered_map live_size; + std::unordered_map live_offset; + + std::size_t total_bytes = 0; + + auto allocate_best_fit = [&](std::size_t need) -> std::size_t { + // Find the *smallest* block whose size >= need (best-fit). + // Since free_list is ordered by offset, we scan to find best by size. + // (For very large sets you could maintain a multimap by size as well.) + auto best = free_list.end(); + for (auto it = free_list.begin(); it != free_list.end(); ++it) { + if (it->size >= need) { + if (best == free_list.end() || it->size < best->size) + best = it; + } + } + if (best != free_list.end()) { + std::size_t off = best->offset; + if (best->size == need) { + free_list.erase(best); + } else { + FreeBlock updated{best->offset + need, best->size - need}; + free_list.erase(best); + free_list.insert(updated); + } + return off; + } + // No free block large enough; grow the heap. 
+ std::size_t off = total_bytes; + total_bytes += need; + return off; + }; + + auto try_coalesce = [&](std::set::iterator it) { + // Coalesce with previous + if (it != free_list.begin()) { + auto prev = std::prev(it); + if (prev->offset + prev->size == it->offset) { + FreeBlock merged{prev->offset, prev->size + it->size}; + free_list.erase(prev); + it = free_list.erase(it); + it = free_list.insert(merged).first; + } + } + // Coalesce with next + auto next = std::next(it); + if (next != free_list.end() && it->offset + it->size == next->offset) { + FreeBlock merged{it->offset, it->size + next->size}; + free_list.erase(next); + it = free_list.erase(it); + free_list.insert(merged); + } + }; + + // Sweep through time. + for (const auto &e : events) { + if (e.type == 0) { // END: free + auto it_sz = live_size.find(e.idx); + auto it_off = live_offset.find(e.idx); + if (it_sz != live_size.end() && it_off != live_offset.end()) { + FreeBlock fb{it_off->second, it_sz->second}; + // Insert and coalesce with neighbors + auto it = free_list.insert(fb).first; + try_coalesce(it); + live_size.erase(it_sz); + live_offset.erase(it_off); + } + } else { // START: allocate + auto &t = tensorsInfo[e.idx]; + std::size_t off = allocate_best_fit(t.size); + tensorsOffset[e.idx] = off; + live_size[e.idx] = t.size; + live_offset[e.idx] = off; + } + } + + return MemoryResult{total_bytes, std::move(tensorsOffset)}; +} -}//SOFIE +} // namespace SOFIE diff --git a/core/test/CMakeLists.txt b/core/test/CMakeLists.txt new file mode 100644 index 0000000..12f19b1 --- /dev/null +++ b/core/test/CMakeLists.txt @@ -0,0 +1,191 @@ +cmake_minimum_required(VERSION 3.14) +include(FetchContent) + +############################################################################ +# Basic setup +############################################################################ +include_directories(${CMAKE_SOURCE_DIR}/core/inc) +include_directories(${CMAKE_SOURCE_DIR}/parsers/inc) + +set(CMAKE_CXX_STANDARD 20) 
+set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if (NOT ONNX_MODELS_DIR) + set(ONNX_MODELS_DIR input_models) +endif() + +option(ENABLE_ALPAKA_TESTS "Enable Alpaka-based SOFIE tests" OFF) + +set(ALPAKA_BACKEND "cuda" + CACHE STRING "Alpaka backend to test (cuda, cpu, hip, sycl)") +set_property(CACHE ALPAKA_BACKEND PROPERTY STRINGS cuda cpu hip sycl) + +############################################################################ +# Generate emitter sources +############################################################################ +set(CAPTURE_STR +"try {\n\ + EmitModel(\"@1\", \"@2\");\n\ +} catch (const std::exception& e) {\n\ + std::string msg = e.what();\n\ + if (msg.find(\"multiple output tensors are not supported\") != std::string::npos) {\n\ + std::cerr << \"[SKIP] Multiple outputs are not supported for @1\" << std::endl;\n\ + } else if (msg.find(\"is of a data type which is not yet supported\") != std::string::npos) {\n\ + std::cerr << \"[SKIP] Operator with unsupported data type in @1: \" << msg << std::endl;\n\ + } else {\n\ + std::cerr << \"[ERROR] Failed processing @1: \" << msg << std::endl;\n\ + failures++;\n\ + }\n\ +} catch (...) 
{\n\ + std::cerr << \"[ERROR] Unknown failure processing @1\" << std::endl;\n\ + failures++;\n\ +}\n\ +") + +file(GLOB ONNX_FILES "${ONNX_MODELS_DIR}/*.onnx") + +set(ALL_CAPTURES "") +foreach(onnx_file ${ONNX_FILES}) + get_filename_component(fname ${onnx_file} NAME_WE) + string(REPLACE "@1" "${onnx_file}" cap "${CAPTURE_STR}") + string(REPLACE "@2" "${fname}" cap "${cap}") + string(APPEND ALL_CAPTURES "${cap}") +endforeach() + +set(EMIT_CAPTURES "${ALL_CAPTURES}") + +configure_file(EmitFromONNX.cxx.in EmitFromONNX_all.cxx @ONLY) +configure_file(EmitFromONNX_GPU_ALPAKA.cxx.in EmitFromONNX_GPU_ALPAKA_all.cxx @ONLY) + +############################################################################ +# Alpaka tests +############################################################################ +if (ENABLE_ALPAKA_TESTS) + + string(TOLOWER "${ALPAKA_BACKEND}" _alpaka_backend) + if (NOT _alpaka_backend IN_LIST ALPAKA_BACKEND) + message(FATAL_ERROR "Unsupported ALPAKA_BACKEND=${ALPAKA_BACKEND}") + endif() + + FetchContent_Declare( + sofieBLAS + GIT_REPOSITORY https://github.com/ML4EP/sofieBLAS + GIT_TAG dev + ) + FetchContent_MakeAvailable(sofieBLAS) + + FetchContent_Declare( + alpaka + GIT_REPOSITORY https://github.com/alpaka-group/alpaka + GIT_TAG 2fa91a34ed11b2076e474c5507d920e85cf9b79d + ) + FetchContent_MakeAvailable(alpaka) + + ########################################################################## + # Alpaka emitter + ########################################################################## + ROOTTEST_GENERATE_EXECUTABLE( + emitFromONNXAlpaka + EmitFromONNX_GPU_ALPAKA_all.cxx + LIBRARIES protobuf::libprotobuf SOFIE_core SOFIE_parsers + FIXTURES_SETUP sofie-compile-models-onnx-alpaka-build + ) + + target_compile_options(emitFromONNXAlpaka PRIVATE + -Wno-unused-parameter + -Wno-array-bounds + ) + + ROOTTEST_ADD_TEST( + SofieCompileModels_ONNX_Alpaka + COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNXAlpaka + FIXTURES_REQUIRED 
sofie-compile-models-onnx-alpaka-build + FIXTURES_SETUP sofie-compile-models-onnx-alpaka + ) + + ########################################################################## + # CUDA backend + ########################################################################## + if (_alpaka_backend STREQUAL "cuda") + + message(STATUS "Enabling Alpaka CUDA tests") + + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + + set_source_files_properties( + TestCustomModelsFromONNXForAlpakaCuda.cxx + PROPERTIES LANGUAGE CUDA + ) + + ROOTTEST_GENERATE_EXECUTABLE( + TestCustomModelsFromONNXForAlpakaCuda + TestCustomModelsFromONNXForAlpakaCuda.cxx + LIBRARIES SOFIE_core GTest::gtest GTest::gtest_main + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka + FIXTURES_SETUP sofie-test-models-onnx-alpaka-build + ) + + target_include_directories( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ${CMAKE_CURRENT_BINARY_DIR} + ${alpaka_SOURCE_DIR}/include + ${sofieblas_SOURCE_DIR}/include + ${CUDAToolkit_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR} + ) + + set_target_properties( + TestCustomModelsFromONNXForAlpakaCuda + PROPERTIES + CUDA_SEPARABLE_COMPILATION OFF + CUDA_ARCHITECTURES 70 80 86 + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + ) + + target_compile_definitions( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ALPAKA_ACC_GPU_CUDA_ENABLED + ALPAKA_HAS_STD_ATOMIC_REF + ) + + target_compile_options( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + $<$: + --extended-lambda + --expt-relaxed-constexpr + --generate-line-info + --use_fast_math + -g + -G + # -fsanitize=address + -O1 + -Wno-deprecated-gpu-targets + > + $<$: + -O2 + -g + -G + -fPIC + -pthread + > + ) + # set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address") + + # ROOT-compatible: plain signature only + target_link_libraries( + TestCustomModelsFromONNXForAlpakaCuda + CUDA::cudart + CUDA::cublas + CUDA::cublasLt + ) + + ROOTTEST_ADD_TEST( + TestCustomModelsFromONNXForAlpakaCuda + EXEC 
./TestCustomModelsFromONNXForAlpakaCuda + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka + ) + + endif() # cuda backend +endif() # ENABLE_ALPAKA_TESTS diff --git a/src/SOFIE_core/test/Conv1dModelGenerator.py b/core/test/Conv1dModelGenerator.py similarity index 100% rename from src/SOFIE_core/test/Conv1dModelGenerator.py rename to core/test/Conv1dModelGenerator.py diff --git a/src/SOFIE_core/test/Conv2dModelGenerator.py b/core/test/Conv2dModelGenerator.py similarity index 100% rename from src/SOFIE_core/test/Conv2dModelGenerator.py rename to core/test/Conv2dModelGenerator.py diff --git a/src/SOFIE_core/test/Conv3dModelGenerator.py b/core/test/Conv3dModelGenerator.py similarity index 100% rename from src/SOFIE_core/test/Conv3dModelGenerator.py rename to core/test/Conv3dModelGenerator.py diff --git a/src/SOFIE_core/test/ConvTrans2dModelGenerator.py b/core/test/ConvTrans2dModelGenerator.py similarity index 100% rename from src/SOFIE_core/test/ConvTrans2dModelGenerator.py rename to core/test/ConvTrans2dModelGenerator.py diff --git a/src/SOFIE_core/test/EmitFromONNX.cxx.in b/core/test/EmitFromONNX.cxx.in similarity index 77% rename from src/SOFIE_core/test/EmitFromONNX.cxx.in rename to core/test/EmitFromONNX.cxx.in index f7a56e2..c464f4d 100644 --- a/src/SOFIE_core/test/EmitFromONNX.cxx.in +++ b/core/test/EmitFromONNX.cxx.in @@ -23,7 +23,13 @@ int EmitModel(std::string filename, std::string outname) { int main(int argc, char *argv[]){ -@EMIT_CAPTURES@ ; + + int failures = 0; + + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ONNX] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 
0 : 1; } diff --git a/core/test/EmitFromONNX_GPU_ALPAKA.cxx.in b/core/test/EmitFromONNX_GPU_ALPAKA.cxx.in new file mode 100644 index 0000000..58198c1 --- /dev/null +++ b/core/test/EmitFromONNX_GPU_ALPAKA.cxx.in @@ -0,0 +1,27 @@ +// Author: Sanjiban Sengupta + +#include "SOFIE/RModel_Base.hxx" +#include "SOFIE/RModel.hxx" +#include "SOFIE/RModelParser_ONNX.hxx" + +using namespace SOFIE; + +int EmitModel(std::string filename, std::string outname) { + + RModelParser_ONNX parser; + RModel model = parser.Parse(filename); + model.GenerateGPU_ALPAKA(); + model.OutputGenerated(outname+"_FromONNX_GPU_ALPAKA.hxx"); + + return 0; +} + +int main(int argc, char *argv[]) { + + int failures = 0; + + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ONNX with ALPAKA] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; +} diff --git a/src/SOFIE_core/test/EmitFromRoot.cxx.in b/core/test/EmitFromRoot.cxx.in similarity index 83% rename from src/SOFIE_core/test/EmitFromRoot.cxx.in rename to core/test/EmitFromRoot.cxx.in index 4a630c7..88c0789 100644 --- a/src/SOFIE_core/test/EmitFromRoot.cxx.in +++ b/core/test/EmitFromRoot.cxx.in @@ -43,6 +43,15 @@ int EmitModel(std::string inputfile, std::string outname){ int main(int argc, char *argv[]){ -@EMIT_CAPTURES@ ; + int failures = 0; + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ROOT] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; + + @EMIT_CAPTURES@; + + std::cout << "[SUMMARY] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 
0 : 1; } diff --git a/src/SOFIE_core/test/GNN/EmitGNN.cxx b/core/test/GNN/EmitGNN.cxx similarity index 100% rename from src/SOFIE_core/test/GNN/EmitGNN.cxx rename to core/test/GNN/EmitGNN.cxx diff --git a/src/SOFIE_core/test/GNN/EmitGraphIndependent.cxx b/core/test/GNN/EmitGraphIndependent.cxx similarity index 100% rename from src/SOFIE_core/test/GNN/EmitGraphIndependent.cxx rename to core/test/GNN/EmitGraphIndependent.cxx diff --git a/src/SOFIE_core/test/LinearModelGenerator.py b/core/test/LinearModelGenerator.py similarity index 100% rename from src/SOFIE_core/test/LinearModelGenerator.py rename to core/test/LinearModelGenerator.py diff --git a/src/SOFIE_core/test/RecurrentModelGenerator.py b/core/test/RecurrentModelGenerator.py similarity index 100% rename from src/SOFIE_core/test/RecurrentModelGenerator.py rename to core/test/RecurrentModelGenerator.py diff --git a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx b/core/test/TestCustomModelsFromONNX.cxx similarity index 99% rename from src/SOFIE_core/test/TestCustomModelsFromONNX.cxx rename to core/test/TestCustomModelsFromONNX.cxx index d02dc5e..902cbcc 100644 --- a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx +++ b/core/test/TestCustomModelsFromONNX.cxx @@ -812,7 +812,7 @@ TEST(ONNX, LinearWithLeakyRelu) { constexpr float TOLERANCE = 1; - // Preparing the standard all-ones input + // Preparing input std::vector input({ 0.4369, -0.6882, 1.0309, -1.0263, -0.1519, 1.2237, -0.7054, -0.1762, -0.6811, -2.2597, 1.0388, -0.7993, 0.1468, 1.3257, -0.4714, -0.0958, @@ -2515,7 +2515,7 @@ TEST(ONNX, Equal){ }); SOFIE_Equal::Session s("Equal_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Equal_ExpectedOutput::outputs) / sizeof(bool)); @@ -2540,7 +2540,7 @@ TEST(ONNX, LessOrEqual){ }); SOFIE_LessOrEqual::Session s("LessOrEqual_FromONNX.dat"); - std::vector output = 
s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(LessOrEqual_ExpectedOutput::outputs) / sizeof(bool)); @@ -2565,7 +2565,7 @@ TEST(ONNX, GreaterOrEqual){ }); SOFIE_GreaterOrEqual::Session s("GreaterOrEqual_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(GreaterOrEqual_ExpectedOutput::outputs) / sizeof(bool)); @@ -2590,7 +2590,7 @@ TEST(ONNX, Greater){ }); SOFIE_Greater::Session s("Greater_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Greater_ExpectedOutput::outputs) / sizeof(bool)); @@ -2615,7 +2615,7 @@ TEST(ONNX, Less){ }); SOFIE_Less::Session s("Less_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Less_ExpectedOutput::outputs) / sizeof(bool)); @@ -2849,6 +2849,7 @@ TEST(ONNX, Slice_Neg) { } } + TEST(ONNX, RangeFloat) { constexpr float TOLERANCE = DEFAULT_TOLERANCE; diff --git a/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx new file mode 100644 index 0000000..e415cce --- /dev/null +++ b/core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx @@ -0,0 +1,2608 @@ +#include +#include + +#include "Linear_64_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Linear_64.ref.hxx" + +#include "AddBroadcast1_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/AddBroadcast1.ref.hxx" + +#include "LinearWithLeakyRelu_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/LinearWithLeakyRelu.ref.hxx" + +#include "LinearWithSigmoid_FromONNX_GPU_ALPAKA.hxx" +#include 
"input_models/references/LinearWithSigmoid.ref.hxx" + +#include "Transpose_FromONNX_GPU_ALPAKA.hxx" + +#include "Concat_0D_FromONNX_GPU_ALPAKA.hxx" +#include "ScatterElements_FromONNX_GPU_ALPAKA.hxx" + +#include "Split_0_FromONNX_GPU_ALPAKA.hxx" +#include "Split_1_FromONNX_GPU_ALPAKA.hxx" +#include "Split_2_FromONNX_GPU_ALPAKA.hxx" + +#include "Tile5D_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Tile5D.ref.hxx" + +#include "GatherAxis0_FromONNX_GPU_ALPAKA.hxx" +#include "GatherAxis1_FromONNX_GPU_ALPAKA.hxx" +#include "GatherAxis2_FromONNX_GPU_ALPAKA.hxx" +#include "GatherAxis3_FromONNX_GPU_ALPAKA.hxx" +#include "Gather2d_FromONNX_GPU_ALPAKA.hxx" +#include "GatherNegativeIndices_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/GatherAxis0.ref.hxx" +#include "input_models/references/GatherAxis1.ref.hxx" +#include "input_models/references/GatherAxis2.ref.hxx" +#include "input_models/references/GatherAxis3.ref.hxx" +#include "input_models/references/Gather2d.ref.hxx" +#include "input_models/references/GatherNegativeIndices.ref.hxx" + +#include "ExpandSameSize_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ExpandSameSize.ref.hxx" + +#include "ExpandDiffSize_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ExpandDiffSize.ref.hxx" + +#include "GatherND_Ex1_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex2_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex3_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex4_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex5_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_NegativeIndices_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Batch_FromONNX_GPU_ALPAKA.hxx" + +#include "Equal_FromONNX_GPU_ALPAKA.hxx" +#include "LessOrEqual_FromONNX_GPU_ALPAKA.hxx" +#include "GreaterOrEqual_FromONNX_GPU_ALPAKA.hxx" +#include "Greater_FromONNX_GPU_ALPAKA.hxx" +#include "Less_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Equal.ref.hxx" +#include "input_models/references/LessOrEqual.ref.hxx" +#include 
"input_models/references/GreaterOrEqual.ref.hxx" +#include "input_models/references/Greater.ref.hxx" +#include "input_models/references/Less.ref.hxx" + +#include "Slice_FromONNX_GPU_ALPAKA.hxx" +#include "Slice_Default_Axis_FromONNX_GPU_ALPAKA.hxx" +#include "Slice_Default_Steps_FromONNX_GPU_ALPAKA.hxx" +#include "Slice_Neg_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Slice.ref.hxx" +#include "input_models/references/Slice_Default_Axis.ref.hxx" +#include "input_models/references/Slice_Default_Steps.ref.hxx" +#include "input_models/references/Slice_Neg.ref.hxx" + +#include "Sin_FromONNX_GPU_ALPAKA.hxx" +#include "Cos_FromONNX_GPU_ALPAKA.hxx" +#include "Abs_FromONNX_GPU_ALPAKA.hxx" +#include "Sqrt_FromONNX_GPU_ALPAKA.hxx" +#include "Reciprocal_FromONNX_GPU_ALPAKA.hxx" +#include "Exp_FromONNX_GPU_ALPAKA.hxx" +#include "Log_FromONNX_GPU_ALPAKA.hxx" +#include "Neg_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Sqrt.ref.hxx" +#include "input_models/references/Reciprocal.ref.hxx" +#include "input_models/references/Exp.ref.hxx" +#include "input_models/references/Log.ref.hxx" +#include "input_models/references/Neg.ref.hxx" + +#include "Where_FromONNX_GPU_ALPAKA.hxx" + +#include "Softplus_FromONNX_GPU_ALPAKA.hxx" + +#include "ReduceMean_FromONNX_GPU_ALPAKA.hxx" +#include "ReduceProd_FromONNX_GPU_ALPAKA.hxx" +#include "ReduceSum_FromONNX_GPU_ALPAKA.hxx" +#include "ReduceSumSquare_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ReduceMean.ref.hxx" +#include "input_models/references/ReduceProd.ref.hxx" + +#include "ConvWithPadding_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvWithPadding.ref.hxx" + +#include "ConvWithoutPadding_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvWithoutPadding.ref.hxx" + +#include "ConvWithAutopadSameLower_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvWithAutopadSameLower.ref.hxx" + +#include "ConvWithStridesPadding_FromONNX_GPU_ALPAKA.hxx" +#include 
"input_models/references/ConvWithStridesPadding.ref.hxx" + +#include "ConvWithStridesNoPadding_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvWithStridesNoPadding.ref.hxx" + +#include "ConvWithAsymmetricPadding_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvWithAsymmetricPadding.ref.hxx" + +#include "BatchNorm_FromONNX_GPU_ALPAKA.hxx" +#include "BatchNormRelu_FromONNX_GPU_ALPAKA.hxx" + +#include "LayerNorm_FromONNX_GPU_ALPAKA.hxx" +#include "LayerNormScaleBias_FromONNX_GPU_ALPAKA.hxx" +#include "LayerNorm3D_FromONNX_GPU_ALPAKA.hxx" + +#include "IsInf_FromONNX_GPU_ALPAKA.hxx" +#include "IsNaN_FromONNX_GPU_ALPAKA.hxx" +#include "Clip_FromONNX_GPU_ALPAKA.hxx" +#include "Not_FromONNX_GPU_ALPAKA.hxx" + +#include "GNN_model_FromONNX_GPU_ALPAKA.hxx" + +#include +#include +#include +#include "gtest/gtest.h" + +constexpr float DEFAULT_TOLERANCE = 1e-3f; + +using Idx = std::size_t; +using Dim = alpaka::DimInt<1>; +using Ext1D = alpaka::Vec; + +class SofieAlpakaTest : public ::testing::Test { +protected: + // Shared devices and platforms + alpaka::PlatformCpu hostPlatform; + alpaka::DevCpu host; + alpaka::PlatformCudaRt platform; + alpaka::DevCudaRt device; + alpaka::Queue queue; + + SofieAlpakaTest() + : hostPlatform{} + , host(alpaka::getDevByIdx(hostPlatform, 0u)) + , platform{} + , device(alpaka::getDevByIdx(platform, 0u)) + , queue(device) + { + } + + void SetUp() override { + cudaDeviceSynchronize(); + } + + void TearDown() override { + alpaka::wait(queue); + cudaDeviceSynchronize(); + } + + ~SofieAlpakaTest() override { + cudaDeviceSynchronize(); + } +}; + + +TEST_F(SofieAlpakaTest, Linear64) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{1600})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + for (Idx i = 0; i < 1600; ++i) { + A_ptr[i] = 1.0; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{1600})); + alpaka::memcpy(queue, A_d, A); + 
alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{160})); + + { + SOFIE_Linear_64::Session session("Linear_64_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(A_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = Linear_64_ExpectedOutput::all_ones; + + for (size_t i = 0; i < 160; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, LinearWithLeakyRelu) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + 0.4369, -0.6882, 1.0309, -1.0263, -0.1519, 1.2237, -0.7054, -0.1762, + -0.6811, -2.2597, 1.0388, -0.7993, 0.1468, 1.3257, -0.4714, -0.0958, + 0.7057, -0.3749, -0.3310, 0.0986, -0.1370, 0.0832, -1.6465, -0.2793 + }); + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + for (Idx i = 0; i < input.size(); ++i) { + A_ptr[i] = input[i]; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, A_d, A); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{24})); + + { + SOFIE_LinearWithLeakyRelu::Session session; + auto result = session.infer(A_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = LinearWithLeakyRelu_ExpectedOutput::outputs; + + for (size_t i = 0; i < 24; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, LinearWithSigmoid) +{ + + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{48})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + for (Idx i = 0; i < 48; ++i) { + 
A_ptr[i] = 1.0; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{48})); + alpaka::memcpy(queue, A_d, A); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{24})); + + { + SOFIE_LinearWithSigmoid::Session session("LinearWithSigmoid_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(A_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = LinearWithSigmoid_ExpectedOutput::all_ones; + for (size_t i = 0; i < 24; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, AddBroadcast1) +{ + + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{5})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + auto B = alpaka::allocBuf(host, Ext1D::all(Idx{20})); + float *B_ptr = reinterpret_cast(alpaka::getPtrNative(B)); + + std::vector A_vec({-0.78023305, -1.34029483, -3.01482951, 0.53641361, + -1.22594789}); + std::vector B_vec({1.0626695, 0.43842875, 1.22476468, 0.79763274, 0.98688211, + 0.25267614, 0.44874883, 0.31516773, -0.78771195, 0.64565664, + 0.50450593, -0.41265227, -0.22474539, -0.22362374, 0.00509674, + 0.16927211, 1.06756969, -0.81634773, 0.88467744, 0.78902059}); + + for (Idx i = 0; i < A_vec.size(); ++i) { + A_ptr[i] = A_vec[i]; + } + + for (Idx i = 0; i < B_vec.size(); ++i) { + B_ptr[i] = B_vec[i]; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{5})); + alpaka::memcpy(queue, A_d, A); + alpaka::wait(queue); + + auto B_d = alpaka::allocBuf(device, Ext1D::all(Idx{20})); + alpaka::memcpy(queue, B_d, B); + alpaka::wait(queue); + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{20})); + + { + SOFIE_AddBroadcast1::Session session; + auto result = session.infer(A_d, B_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, 
result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = AddBroadcast1_ExpectedOutput::output; + for (size_t i = 0; i < 20; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, Transpose) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Input shape: (2, 1, 3, 4) -> 24 elements + constexpr Idx inputSize = 24; + // Output shape: (2, 3, 4, 1) -> 24 elements + constexpr Idx outputSize = 24; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + + std::vector input_vec({ + // shape (2, 1, 3, 4) + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f + }); + + for (Idx i = 0; i < inputSize; ++i) + input_ptr[i] = input_vec[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Transpose::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + std::vector expected(outputSize); + std::vector inputShape = {2, 1, 3, 4}; + std::vector perm = {0, 2, 3, 1}; + std::vector outputShape = {2, 3, 4, 1}; + + std::vector inputStrides = {12, 12, 4, 1}; + std::vector outputStrides = {12, 4, 1, 1}; + + for (size_t i = 0; i < outputSize; ++i) + { + size_t remaining = i; + size_t inputIdx = 0; + for (size_t d = 0; d < 4; ++d) + { + size_t const coord = remaining / outputStrides[d]; + remaining = remaining - coord * outputStrides[d]; + inputIdx += coord * inputStrides[perm[d]]; + } + expected[i] = input_vec[inputIdx]; + } + + float* res_ptr = 
reinterpret_cast(alpaka::getPtrNative(result_h)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - expected[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Concat0D) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({1.40519865e+00, -2.87660856e-01}); + std::vector expected_output({ + 1.40519865e+00, -2.87660856e-01, + 1.40519865e+00, -2.87660856e-01 + }); + + // Host input buffer + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + + for (Idx i = 0; i < input.size(); ++i) + input_ptr[i] = input[i]; + + // Device input buffer + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + // Host output buffer + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected_output.size()})); + + { + SOFIE_Concat_0D::Session session("Concat_0D_FromONNX_GPU_ALPAKA.dat"); + + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + + for (size_t i = 0; i < expected_output.size(); ++i) { + EXPECT_LE(std::abs(res_ptr[i] - expected_output[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, ScatterElements) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input (9, 0.f); + std::vector indices = { 1, 0, 2, 0, 2, 1 }; + std::vector updates = { 1.f, 1.1f, 1.2f, 2.f, 2.1f, 2.2f }; + std::vector correct = { 2.f, 1.1f, 0.f, 1.f, 0.f, 2.2f, 0.f, 2.1f, 1.2f }; + + // Allocate and fill host buffers + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + auto indices_h = alpaka::allocBuf(host, Ext1D::all(Idx{indices.size()})); + auto updates_h = alpaka::allocBuf(host, Ext1D::all(Idx{updates.size()})); + + float* input_ptr = reinterpret_cast 
(alpaka::getPtrNative(input_h)); + int64_t* indices_ptr = reinterpret_cast(alpaka::getPtrNative(indices_h)); + float* updates_ptr = reinterpret_cast (alpaka::getPtrNative(updates_h)); + + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + for (Idx i = 0; i < indices.size(); ++i) indices_ptr[i] = indices[i]; + for (Idx i = 0; i < updates.size(); ++i) updates_ptr[i] = updates[i]; + + // Allocate device buffers and copy + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + auto indices_d = alpaka::allocBuf(device, Ext1D::all(Idx{indices.size()})); + auto updates_d = alpaka::allocBuf(device, Ext1D::all(Idx{updates.size()})); + + alpaka::memcpy(queue, input_d, input_h); + alpaka::memcpy(queue, indices_d, indices_h); + alpaka::memcpy(queue, updates_d, updates_h); + alpaka::wait(queue); + + // Host result buffer + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct.size()})); + + { + SOFIE_ScatterElements::Session session; + auto result = session.infer(input_d, indices_d, updates_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(correct.size(), 9u); + for (size_t i = 0; i < correct.size(); ++i){ + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, Split_0) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // split in axis 0 in 2 tensors {2,2,3} -> {1,2,3} each + std::vector input {1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.}; + std::vector> correct_output = { {1.,2.,3.,4.,5.,6.}, {7.,8.,9.,10.,11.,12.} }; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + 
alpaka::wait(queue); + + auto result0_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[0].size()})); + auto result1_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[1].size()})); + + { + SOFIE_Split_0::Session session; + auto [result0, result1] = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result0_h, result0); + alpaka::memcpy(queue, result1_h, result1); + alpaka::wait(queue); + } + + float* res0_ptr = reinterpret_cast(alpaka::getPtrNative(result0_h)); + float* res1_ptr = reinterpret_cast(alpaka::getPtrNative(result1_h)); + + for (size_t j = 0; j < correct_output[0].size(); ++j) + EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE); + for (size_t j = 0; j < correct_output[1].size(); ++j) + EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Split_1) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // split in axis 1 in 2 tensors {2,2,3} -> {2,1,3} each + std::vector input {1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.}; + std::vector> correct_output = { {1.,2.,3.,7.,8.,9.}, {4.,5.,6.,10.,11.,12.} }; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result0_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[0].size()})); + auto result1_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[1].size()})); + + { + SOFIE_Split_1::Session session; + auto [result0, result1] = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result0_h, result0); + alpaka::memcpy(queue, result1_h, result1); + alpaka::wait(queue); + } + + float* res0_ptr = 
reinterpret_cast(alpaka::getPtrNative(result0_h)); + float* res1_ptr = reinterpret_cast(alpaka::getPtrNative(result1_h)); + + for (size_t j = 0; j < correct_output[0].size(); ++j) + EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE); + for (size_t j = 0; j < correct_output[1].size(); ++j) + EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Split_2) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // split in axis 2 in 2 tensors {2,2,3} -> {2,2,2} and {2,2,1} + std::vector input {1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.}; + std::vector> correct_output = { {1.,2.,4.,5.,7.,8.,10.,11.}, {3.,6.,9.,12.} }; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + // outputs have different sizes: {2,2,2}=8 and {2,2,1}=4 + auto result0_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[0].size()})); + auto result1_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[1].size()})); + + { + SOFIE_Split_2::Session session; + auto [result0, result1] = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result0_h, result0); + alpaka::memcpy(queue, result1_h, result1); + alpaka::wait(queue); + } + + float* res0_ptr = reinterpret_cast(alpaka::getPtrNative(result0_h)); + float* res1_ptr = reinterpret_cast(alpaka::getPtrNative(result1_h)); + + for (size_t j = 0; j < correct_output[0].size(); ++j) + EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE); + for (size_t j = 0; j < correct_output[1].size(); ++j) + EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Tile5D) +{ + constexpr float TOLERANCE = 
DEFAULT_TOLERANCE; + + std::vector input_data({ + 0.2386120855808258, 0.5549510717391968, -1.8190287351608276, 0.5724563598632812, -0.6596977710723877, + 0.17560836672782898, 0.7608169317245483, 0.08603227883577347, -0.049375515431165695, 0.2705111503601074, + 1.42119562625885, 0.032626643776893616, -1.212586522102356, -0.5129594802856445, -0.43296414613723755, + -0.1606937050819397, 1.1884371042251587, -0.662174642086029, -2.291109323501587, -0.6852569580078125, + 2.325223922729492, -0.19389064610004425, -0.5784135460853577, -0.39328137040138245, 0.2831517457962036, + 0.4496127665042877, -0.2029038816690445, 0.35477763414382935, 0.4266718924045563, 0.24683749675750732, + 1.90426504611969, -0.4861580729484558, 0.9139055013656616, -0.5031066536903381, 0.9583520293235779, + -0.23210509121418, 1.3183971643447876, 1.7042455673217773, -0.3201166093349457, -0.14444805681705475, + -0.8829464912414551, 1.725736141204834, 0.45657631754875183, 0.4920198321342468, -1.088847041130066, + 0.49437597393989563, -0.006085286382585764, 2.475630760192871, 0.12170185893774033, -0.8953945636749268, + 1.1430096626281738, 1.3278610706329346, 0.3076854348182678, 0.036237504333257675, 0.05180325731635094, + 0.2802475392818451, 0.5289335250854492, 0.9356630444526672, 0.7863689064979553, 0.4239695370197296, + 0.8723016977310181, -0.2248474359512329, 0.3891502320766449, 0.5463842153549194, -0.7782878875732422, + -0.8570080399513245, -2.593783378601074, -0.11392943561077118, 0.5637082457542419, 2.075004816055298, + -1.0598397254943848, 1.0823975801467896 + }); + + const std::size_t inputSize = input_data.size(); + const std::size_t outputSize = sizeof(Tile5D_ExpectedOutput::output) / sizeof(float); + + // Allocate and fill host input buffer + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < inputSize; ++i) + input_ptr[i] = input_data[i]; + + // Copy to device + auto input_d = 
alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + // Host result buffer + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Tile5D::Session session; + auto result = session.infer(input_d); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Tile5D_ExpectedOutput::output; + + EXPECT_EQ(outputSize, sizeof(Tile5D_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis0) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis0_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis0::Session session("GatherAxis0_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis0_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis0_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis1) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx 
inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis1_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis1::Session session("GatherAxis1_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis1_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis1_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis2) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis2_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis2::Session session("GatherAxis2_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = 
reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis2_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis2_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis3) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis3_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis3::Session session("GatherAxis3_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis3_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis3_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Gather2d) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 9; + const std::size_t outputSize = sizeof(Gather2d_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); 
+ alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Gather2d::Session session("Gather2d_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Gather2d_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(Gather2d_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherNegativeIndices) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 10; + const std::size_t outputSize = sizeof(GatherNegativeIndices_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherNegativeIndices::Session session("GatherNegativeIndices_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherNegativeIndices_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherNegativeIndices_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, ExpandSameSize) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector 
input({0.f, 1.f, 2.f}); + const std::size_t outputSize = sizeof(ExpandSameSize_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) + input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ExpandSameSize::Session session("ExpandSameSize_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ExpandSameSize_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(ExpandSameSize_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, ExpandDiffSize) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({0.f, 1.f, 2.f}); + const std::size_t outputSize = sizeof(ExpandDiffSize_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) + input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ExpandDiffSize::Session session("ExpandDiffSize_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, 
result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ExpandDiffSize_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(ExpandDiffSize_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherND_Ex1) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f, 1.f, 2.f, 3.f}; + std::vector expected = {0.f, 3.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex1::Session session("GatherND_Ex1_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 2u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex2) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f, 1.f, 2.f, 3.f}; + std::vector expected = {2.f, 3.f, 0.f, 1.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, 
Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex2::Session session("GatherND_Ex2_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex3) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + std::vector expected = {2.f, 3.f, 4.f, 5.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex3::Session session("GatherND_Ex3_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex4) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + std::vector expected = {2.f, 3.f, 4.f, 5.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + 
auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex4::Session session("GatherND_Ex4_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex5) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + std::vector expected = {2.f, 3.f, 4.f, 5.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex5::Session session("GatherND_Ex5_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_NegativeIndices) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f}; + std::vector expected = {6.f, 2.f, 4.f}; + + auto input_h = 
alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_NegativeIndices::Session session("GatherND_NegativeIndices_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 3u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Batch) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data(24); + std::iota(data.begin(), data.end(), 0.f); + std::vector expected = {4.f,5.f,6.f,7.f, 20.f,21.f,22.f,23.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Batch::Session session("GatherND_Batch_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 8u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), 
TOLERANCE) << "i=" << i;
+}
+
+// Equal: runs SOFIE_Equal on two 3-element float inputs and compares each
+// output byte with Equal_ExpectedOutput::outputs.
+TEST_F(SofieAlpakaTest, Equal)
+{
+    std::vector<float> input1 = {1.0f, 2.0f, 3.0f};
+    std::vector<float> input2 = {4.0f, 2.0f, 6.0f};
+    const std::size_t outputSize = sizeof(Equal_ExpectedOutput::outputs) / sizeof(bool);
+
+    auto input1_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input1.size()}));
+    auto input2_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input2.size()}));
+    float* in1_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input1_h));
+    float* in2_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input2_h));
+    for (Idx i = 0; i < input1.size(); ++i) in1_ptr[i] = input1[i];
+    for (Idx i = 0; i < input2.size(); ++i) in2_ptr[i] = input2[i];
+
+    auto input1_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input1.size()}));
+    auto input2_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input2.size()}));
+    alpaka::memcpy(queue, input1_d, input1_h);
+    alpaka::memcpy(queue, input2_d, input2_h);
+    alpaka::wait(queue);
+
+    // Output is bool — allocate as bool buffer
+    // NOTE(review): element type taken from the comment above; confirm it
+    // matches the buffer type returned by Session::infer.
+    auto result_h = alpaka::allocBuf<bool, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        SOFIE_Equal::Session session("Equal_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input1_d, input2_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    uint8_t* res_ptr = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(result_h));
+    bool* correct = Equal_ExpectedOutput::outputs;
+    // NOTE(review): tautological, outputSize is computed from the same array.
+    EXPECT_EQ(outputSize, sizeof(Equal_ExpectedOutput::outputs) / sizeof(bool));
+    for (size_t i = 0; i < outputSize; ++i)
+        EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, LessOrEqual)
+{
+    std::vector<float> input1 = {1.0f, 2.0f, 3.0f};
+    std::vector<float> input2 = {4.0f, 2.0f, 6.0f};
+    const std::size_t outputSize = sizeof(LessOrEqual_ExpectedOutput::outputs) / sizeof(bool);
+
+    auto input1_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input1.size()}));
+    auto input2_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input2.size()}));
+    float* in1_ptr =
reinterpret_cast(alpaka::getPtrNative(input1_h)); + float* in2_ptr = reinterpret_cast(alpaka::getPtrNative(input2_h)); + for (Idx i = 0; i < input1.size(); ++i) in1_ptr[i] = input1[i]; + for (Idx i = 0; i < input2.size(); ++i) in2_ptr[i] = input2[i]; + + auto input1_d = alpaka::allocBuf(device, Ext1D::all(Idx{input1.size()})); + auto input2_d = alpaka::allocBuf(device, Ext1D::all(Idx{input2.size()})); + alpaka::memcpy(queue, input1_d, input1_h); + alpaka::memcpy(queue, input2_d, input2_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_LessOrEqual::Session session("LessOrEqual_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input1_d, input2_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + bool* correct = LessOrEqual_ExpectedOutput::outputs; + EXPECT_EQ(outputSize, sizeof(LessOrEqual_ExpectedOutput::outputs) / sizeof(bool)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GreaterOrEqual) +{ + std::vector input1 = {1.0f, 2.0f, 3.0f}; + std::vector input2 = {4.0f, 2.0f, 6.0f}; + const std::size_t outputSize = sizeof(GreaterOrEqual_ExpectedOutput::outputs) / sizeof(bool); + + auto input1_h = alpaka::allocBuf(host, Ext1D::all(Idx{input1.size()})); + auto input2_h = alpaka::allocBuf(host, Ext1D::all(Idx{input2.size()})); + float* in1_ptr = reinterpret_cast(alpaka::getPtrNative(input1_h)); + float* in2_ptr = reinterpret_cast(alpaka::getPtrNative(input2_h)); + for (Idx i = 0; i < input1.size(); ++i) in1_ptr[i] = input1[i]; + for (Idx i = 0; i < input2.size(); ++i) in2_ptr[i] = input2[i]; + + auto input1_d = alpaka::allocBuf(device, Ext1D::all(Idx{input1.size()})); + auto input2_d = alpaka::allocBuf(device, Ext1D::all(Idx{input2.size()})); + alpaka::memcpy(queue, input1_d, input1_h); 
+ alpaka::memcpy(queue, input2_d, input2_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GreaterOrEqual::Session session("GreaterOrEqual_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input1_d, input2_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + bool* correct = GreaterOrEqual_ExpectedOutput::outputs; + EXPECT_EQ(outputSize, sizeof(GreaterOrEqual_ExpectedOutput::outputs) / sizeof(bool)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Greater) +{ + std::vector input1 = {1.0f, 2.0f, 3.0f}; + std::vector input2 = {4.0f, 2.0f, 6.0f}; + const std::size_t outputSize = sizeof(Greater_ExpectedOutput::outputs) / sizeof(bool); + + auto input1_h = alpaka::allocBuf(host, Ext1D::all(Idx{input1.size()})); + auto input2_h = alpaka::allocBuf(host, Ext1D::all(Idx{input2.size()})); + float* in1_ptr = reinterpret_cast(alpaka::getPtrNative(input1_h)); + float* in2_ptr = reinterpret_cast(alpaka::getPtrNative(input2_h)); + for (Idx i = 0; i < input1.size(); ++i) in1_ptr[i] = input1[i]; + for (Idx i = 0; i < input2.size(); ++i) in2_ptr[i] = input2[i]; + + auto input1_d = alpaka::allocBuf(device, Ext1D::all(Idx{input1.size()})); + auto input2_d = alpaka::allocBuf(device, Ext1D::all(Idx{input2.size()})); + alpaka::memcpy(queue, input1_d, input1_h); + alpaka::memcpy(queue, input2_d, input2_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Greater::Session session("Greater_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input1_d, input2_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = 
reinterpret_cast(alpaka::getPtrNative(result_h)); + bool* correct = Greater_ExpectedOutput::outputs; + EXPECT_EQ(outputSize, sizeof(Greater_ExpectedOutput::outputs) / sizeof(bool)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Less) +{ + std::vector input1 = {1.0f, 2.0f, 3.0f}; + std::vector input2 = {4.0f, 2.0f, 6.0f}; + const std::size_t outputSize = sizeof(Less_ExpectedOutput::outputs) / sizeof(bool); + + auto input1_h = alpaka::allocBuf(host, Ext1D::all(Idx{input1.size()})); + auto input2_h = alpaka::allocBuf(host, Ext1D::all(Idx{input2.size()})); + float* in1_ptr = reinterpret_cast(alpaka::getPtrNative(input1_h)); + float* in2_ptr = reinterpret_cast(alpaka::getPtrNative(input2_h)); + for (Idx i = 0; i < input1.size(); ++i) in1_ptr[i] = input1[i]; + for (Idx i = 0; i < input2.size(); ++i) in2_ptr[i] = input2[i]; + + auto input1_d = alpaka::allocBuf(device, Ext1D::all(Idx{input1.size()})); + auto input2_d = alpaka::allocBuf(device, Ext1D::all(Idx{input2.size()})); + alpaka::memcpy(queue, input1_d, input1_h); + alpaka::memcpy(queue, input2_d, input2_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Less::Session session("Less_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input1_d, input2_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + bool* correct = Less_ExpectedOutput::outputs; + EXPECT_EQ(outputSize, sizeof(Less_ExpectedOutput::outputs) / sizeof(bool)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Slice) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = Slice::input; + const std::size_t outputSize = sizeof(Slice::output) / sizeof(float); + + auto input_h = 
alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Slice::Session session("Slice_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Slice::output; + EXPECT_EQ(outputSize, sizeof(Slice::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Slice_Default_Axis) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = Slice_Default_Axis::input; + const std::size_t outputSize = sizeof(Slice_Default_Axis::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Slice_Default_Axis::Session session("Slice_Default_Axis_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Slice_Default_Axis::output; + EXPECT_EQ(outputSize, 
sizeof(Slice_Default_Axis::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Slice_Default_Steps) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = Slice_Default_Steps::input; + const std::size_t outputSize = sizeof(Slice_Default_Steps::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Slice_Default_Steps::Session session("Slice_Default_Steps_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Slice_Default_Steps::output; + EXPECT_EQ(outputSize, sizeof(Slice_Default_Steps::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Slice_Neg) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = Slice_Neg::input; + const std::size_t outputSize = sizeof(Slice_Neg::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = 
alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Slice_Neg::Session session("Slice_Neg_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Slice_Neg::output; + EXPECT_EQ(outputSize, sizeof(Slice_Neg::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Sin) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + -0.786738f, -0.197796f, -0.187787f, 0.142758f, + 0.876096f, -0.653239f, 0.145444f, -1.107658f, + 2.259171f, -0.947054f, -0.506689f, 1.801250f + }); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + + { + SOFIE_Sin::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(input.size(), 12u); + for (size_t i = 0; i < input.size(); ++i) + EXPECT_LE(std::abs(res_ptr[i] - std::sin(input[i])), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Cos) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + 1.152504f, -1.459324f, 0.691594f, 0.347690f, + -1.307323f, 1.832516f, -1.261772f, 0.014224f, + 1.311477f, 1.147405f, -0.567206f, -0.530606f + }); + + auto input_h = alpaka::allocBuf(host, 
Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + + { + SOFIE_Cos::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(input.size(), 12u); + for (size_t i = 0; i < input.size(); ++i) + EXPECT_LE(std::abs(res_ptr[i] - std::cos(input[i])), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Abs) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({1.f, -2.f, -3.f, 4.f, -5.f, 6.f}); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + + { + SOFIE_Abs::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(input.size(), 6u); + for (size_t i = 0; i < input.size(); ++i) + EXPECT_LE(std::abs(res_ptr[i] - std::abs(input[i])), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Sqrt) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({0.8344f, 0.4716f, 0.6226f, 0.8448f, 0.2483f, 0.9467f}); + const std::size_t outputSize 
= sizeof(Sqrt_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Sqrt::Session session("Sqrt_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Sqrt_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(Sqrt_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Reciprocal) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({1.2691f, -1.2160f, 0.6393f, -0.4438f, 0.8065f, 0.2011f}); + const std::size_t outputSize = sizeof(Reciprocal_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Reciprocal::Session session("Reciprocal_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = 
reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Reciprocal_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(Reciprocal_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Exp) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + 1.46566453f, 0.63334515f, 2.4048165f, 0.54468453f, + -1.41271672f, -0.18609187f, 0.2754482f, 1.10615209f, + 0.88474389f, 0.47531232f + }); + const std::size_t outputSize = sizeof(Exp_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Exp::Session session("Exp_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Exp_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(Exp_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Log) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({1.f, 2.f, 3.f, 4.f}); + const std::size_t outputSize = sizeof(Log_ExpectedOutput::outputs) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); 
++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Log::Session session("Log_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Log_ExpectedOutput::outputs; + EXPECT_EQ(outputSize, sizeof(Log_ExpectedOutput::outputs) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Neg) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + -1.9100f, 1.8811f, -1.7269f, -0.1094f, + -0.0145f, 0.2509f, 0.5893f, -2.2733f, + -0.7077f, 1.0645f, -0.8607f, 0.2085f + }); + const std::size_t outputSize = sizeof(Neg_ExpectedOutput::outputs) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Neg::Session session("Neg_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Neg_ExpectedOutput::outputs; + EXPECT_EQ(outputSize, sizeof(Neg_ExpectedOutput::outputs) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + 
EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Softplus) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({0.1,-0.2,0.3,-0.4,0.5,1.}); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + + { + SOFIE_Softplus::Session session("Softplus_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + for (size_t i = 0; i < input.size(); ++i){ + double exp_value = std::log(std::exp(input[i])+1); + EXPECT_LE(std::abs(res_ptr[i] - exp_value), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, Where) +{ + std::vector input1 = {1.f, 2.f}; + std::vector input2 = {3.f, 4.f, 5.f, 6.f}; + std::vector cond_vec = {true, false, true}; + std::vector correct = {1.f, 2.f, 5.f, 6.f, 1.f, 2.f}; + + auto input1_h = alpaka::allocBuf(host, Ext1D::all(Idx{input1.size()})); + float* in1_ptr = reinterpret_cast(alpaka::getPtrNative(input1_h)); + for (Idx i = 0; i < input1.size(); ++i) in1_ptr[i] = input1[i]; + + auto input1_d = alpaka::allocBuf(device, Ext1D::all(Idx{input1.size()})); + alpaka::memcpy(queue, input1_d, input1_h); + + auto input2_h = alpaka::allocBuf(host, Ext1D::all(Idx{input2.size()})); + float* in2_ptr = reinterpret_cast(alpaka::getPtrNative(input2_h)); + for (Idx i = 0; i < input2.size(); ++i) in2_ptr[i] = input2[i]; + + auto input2_d = alpaka::allocBuf(device, Ext1D::all(Idx{input2.size()})); + alpaka::memcpy(queue, input2_d, input2_h); + 
+ auto cond_h = alpaka::allocBuf(host, Ext1D::all(Idx{cond_vec.size()})); + uint8_t* cond_ptr = reinterpret_cast(alpaka::getPtrNative(cond_h)); + for (Idx i = 0; i < cond_vec.size(); ++i) cond_ptr[i] = cond_vec[i]; + + auto cond_d = alpaka::allocBuf(device, Ext1D::all(Idx{cond_vec.size()})); + alpaka::memcpy(queue, cond_d, cond_h); + + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct.size()})); + + { + SOFIE_Where::Session session("Where_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input1_d, input2_d, cond_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(correct.size(), 6u); + for (size_t i = 0; i < correct.size(); ++i) + EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, ReduceMean) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f}; + const std::size_t outputSize = sizeof(ReduceMean_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ReduceMean::Session session("ReduceMean_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ReduceMean_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(ReduceMean_ExpectedOutput::output) / 
sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, ReduceProd) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f}; + const std::size_t outputSize = sizeof(ReduceProd_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ReduceProd::Session session("ReduceProd_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ReduceProd_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(ReduceProd_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, ReduceSum) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f}; + std::vector correct = {24.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct.size()})); + + { + 
SOFIE_ReduceSum::Session session("ReduceSum_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(correct.size(), 1u); + for (size_t i = 0; i < correct.size(); ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, ReduceSumSquare) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f}; + std::vector correct = {38.f, 66.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct.size()})); + + { + SOFIE_ReduceSumSquare::Session session("ReduceSumSquare_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + for (size_t i = 0; i < correct.size(); ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, ConvWithPadding) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Preparing the standard all-ones input + std::vector input(25); + std::iota(input.begin(), input.end(), 0.0f); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, 
Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(ConvWithPadding_ExpectedOutput::all_ones) / sizeof(float)})); + + { + SOFIE_ConvWithPadding::Session session("ConvWithPadding_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = ConvWithPadding_ExpectedOutput::all_ones; + + for (size_t i = 0; i < 25; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + + +TEST_F(SofieAlpakaTest, ConvWithoutPadding) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Preparing the standard all-ones input + std::vector input(25); + std::iota(input.begin(), input.end(), 0.0f); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(ConvWithoutPadding_ExpectedOutput::all_ones) / sizeof(float)})); + + { + SOFIE_ConvWithoutPadding::Session session("ConvWithoutPadding_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + + } + + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = ConvWithoutPadding_ExpectedOutput::all_ones; + constexpr size_t nOut_convNoPad = sizeof(ConvWithoutPadding_ExpectedOutput::all_ones) / sizeof(float); + + for (size_t i = 0; i < nOut_convNoPad; ++i) { + 
EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } + +} + + +TEST_F(SofieAlpakaTest, ConvWithAutopadSameLower) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Preparing the standard all-ones input + std::vector input(25); + std::iota(input.begin(), input.end(), 0.0f); + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(ConvWithAutopadSameLower_ExpectedOutput::all_ones) / sizeof(float)})); + + { + SOFIE_ConvWithAutopadSameLower::Session session("ConvWithAutopadSameLower_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = ConvWithAutopadSameLower_ExpectedOutput::all_ones; + + for (size_t i = 0; i < 9; ++i) { + std::cout << "res: " << res_ptr[i] << ", correct: " << correct[i] << std::endl; + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + + +TEST_F(SofieAlpakaTest, ConvWithStridesPadding) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Preparing the standard all-ones input + std::vector input(35); + std::iota(input.begin(), input.end(), 0.0f); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = 
alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(ConvWithStridesPadding_ExpectedOutput::all_ones) / sizeof(float)})); + + { + SOFIE_ConvWithStridesPadding::Session session("ConvWithStridesPadding_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = ConvWithStridesPadding_ExpectedOutput::all_ones; + constexpr size_t nOut_stridesPad = sizeof(ConvWithStridesPadding_ExpectedOutput::all_ones) / sizeof(float); + + for (size_t i = 0; i < nOut_stridesPad; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + + +TEST_F(SofieAlpakaTest, ConvWithStridesNoPadding) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Preparing the standard all-ones input + std::vector input(35); + std::iota(input.begin(), input.end(), 0.0f); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(ConvWithStridesNoPadding_ExpectedOutput::all_ones) / sizeof(float)})); + + { + SOFIE_ConvWithStridesNoPadding::Session session("ConvWithStridesNoPadding_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = ConvWithStridesNoPadding_ExpectedOutput::all_ones; + constexpr size_t nOut_stridesNoPad = sizeof(ConvWithStridesNoPadding_ExpectedOutput::all_ones) / sizeof(float); + 
+ for (size_t i = 0; i < nOut_stridesNoPad; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + + +// Disables test (asymmetric padding not supported) +TEST_F(SofieAlpakaTest, ConvWithAsymmetricPadding) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Preparing the standard all-ones input + std::vector input(35); + std::iota(input.begin(), input.end(), 0.0f); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(ConvWithAsymmetricPadding_ExpectedOutput::all_ones) / sizeof(float)})); + + { + SOFIE_ConvWithAsymmetricPadding::Session session("ConvWithAsymmetricPadding_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = ConvWithAsymmetricPadding_ExpectedOutput::all_ones; + constexpr size_t nOut_asymPad = sizeof(ConvWithAsymmetricPadding_ExpectedOutput::all_ones) / sizeof(float); + + for (size_t i = 0; i < nOut_asymPad; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + +TEST_F(SofieAlpakaTest, BatchNormalization) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = { + 1.f, 2.f, 3.f, 4.f, // channel 0 + 5.f, 6.f, 7.f, 8.f // channel 1 + }; + const std::size_t outputSize = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = 
input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_BatchNorm::Session session("BatchNorm_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + + float inv_std = 1.f / std::sqrt(1.f + 1e-5f); + ASSERT_EQ(outputSize, 8u); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - input[i] * inv_std), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, BatchNormalizationRelu) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = { + -1.f, 2.f, -3.f, 4.f, + 5.f, -6.f, 7.f, -8.f + }; + const std::size_t outputSize = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_BatchNormRelu::Session session("BatchNormRelu_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + + float inv_std = 1.f / std::sqrt(1.f + 1e-5f); + ASSERT_EQ(outputSize, 8u); + for (size_t i = 0; i < outputSize; ++i) { + float expected = std::max(0.f, input[i] * inv_std); + EXPECT_LE(std::abs(res_ptr[i] - expected), TOLERANCE) << "i=" << i; + } +} + +TEST_F(SofieAlpakaTest, 
LayerNorm) +{ + constexpr float TOLERANCE = 1e-4f; + std::vector input = {1.f, 2.f, 3.f, 4.f, + 5.f, 6.f, 7.f, 8.f}; + const std::size_t outputSize = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_LayerNorm::Session session("LayerNorm_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + + // Row 0: mean=2.5, std=sqrt(1.25+1e-5) ≈ 1.118034 + // Row 1: mean=6.5, std=sqrt(1.25+1e-5) ≈ 1.118034 + // Y[0] = (1-2.5)/1.118034 ≈ -1.3416 + // Y[1] = (2-2.5)/1.118034 ≈ -0.4472 + // Y[2] = (3-2.5)/1.118034 ≈ 0.4472 + // Y[3] = (4-2.5)/1.118034 ≈ 1.3416 + float inv_std = 1.f / std::sqrt(1.25f + 1e-5f); + std::vector expected = { + (1.f - 2.5f) * inv_std, (2.f - 2.5f) * inv_std, + (3.f - 2.5f) * inv_std, (4.f - 2.5f) * inv_std, + (5.f - 6.5f) * inv_std, (6.f - 6.5f) * inv_std, + (7.f - 6.5f) * inv_std, (8.f - 6.5f) * inv_std + }; + ASSERT_EQ(outputSize, 8u); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, LayerNormScaleBias) +{ + constexpr float TOLERANCE = 1e-4f; + + std::vector input = {1.f, 2.f, 3.f, 4.f, + 5.f, 6.f, 7.f, 8.f}; + const std::size_t outputSize = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = 
input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_LayerNormScaleBias::Session session("LayerNormScaleBias_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + + float inv_std = 1.f / std::sqrt(1.25f + 1e-5f); + std::vector expected = { + 2.f * (1.f - 2.5f) * inv_std + 1.f, 2.f * (2.f - 2.5f) * inv_std + 1.f, + 2.f * (3.f - 2.5f) * inv_std + 1.f, 2.f * (4.f - 2.5f) * inv_std + 1.f, + 2.f * (5.f - 6.5f) * inv_std + 1.f, 2.f * (6.f - 6.5f) * inv_std + 1.f, + 2.f * (7.f - 6.5f) * inv_std + 1.f, 2.f * (8.f - 6.5f) * inv_std + 1.f + }; + ASSERT_EQ(outputSize, 8u); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, LayerNorm3D) +{ + constexpr float TOLERANCE = 1e-4f; + + std::vector input(24); + std::iota(input.begin(), input.end(), 0.f); // 0..23 + const std::size_t outputSize = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_LayerNorm3D::Session session("LayerNorm3D_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = 
reinterpret_cast(alpaka::getPtrNative(result_h)); + + auto compute_expected = [](std::vector row) { + float mean = 0.f; + for (float v : row) mean += v; + mean /= row.size(); + float var = 0.f; + for (float v : row) var += (v - mean) * (v - mean); + var /= row.size(); + float inv_std = 1.f / std::sqrt(var + 1e-5f); + std::vector out; + for (float v : row) out.push_back((v - mean) * inv_std); + return out; + }; + + std::vector row0(input.begin(), input.begin() + 12); + std::vector row1(input.begin() + 12, input.end()); + auto exp0 = compute_expected(row0); + auto exp1 = compute_expected(row1); + + ASSERT_EQ(outputSize, 24u); + for (size_t i = 0; i < 12; ++i) + EXPECT_LE(std::abs(res_ptr[i] - exp0[i]), TOLERANCE) << "row0 i=" << i; + for (size_t i = 0; i < 12; ++i) + EXPECT_LE(std::abs(res_ptr[12 + i] - exp1[i]), TOLERANCE) << "row1 i=" << i; +} + +TEST_F(SofieAlpakaTest, IsInf) +{ + // Input contains finite values, +inf, -inf; output is bool (uint8_t). + float pos_inf = std::numeric_limits::infinity(); + float neg_inf = -std::numeric_limits::infinity(); + std::vector input = {1.0f, pos_inf, neg_inf, 0.0f, -1.0f, 2.0f, neg_inf, pos_inf}; + const std::size_t N = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < N; ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{N})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + + { + SOFIE_IsInf::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(N, 8u); + for (size_t i = 0; i < N; ++i) + EXPECT_EQ(static_cast(res_ptr[i]), std::isinf(input[i])) << "i=" << i; +} + 
+TEST_F(SofieAlpakaTest, IsNaN) +{ + // Input contains finite values, +inf, and NaN; output is bool (uint8_t). + float nan_val = std::numeric_limits::quiet_NaN(); + float pos_inf = std::numeric_limits::infinity(); + std::vector input = {1.0f, nan_val, 0.0f, pos_inf, nan_val, 2.0f, -1.0f, nan_val}; + const std::size_t N = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < N; ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{N})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + + { + SOFIE_IsNaN::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(N, 8u); + for (size_t i = 0; i < N; ++i) + EXPECT_EQ(static_cast(res_ptr[i]), std::isnan(input[i])) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Clip) +{ + // Model clips to [-1.0, 1.0]. 
+ constexpr float TOLERANCE = DEFAULT_TOLERANCE; + constexpr float clip_min = -1.0f; + constexpr float clip_max = 1.0f; + + std::vector input = { + -2.0f, -1.5f, -1.0f, -0.5f, + 0.0f, 0.5f, 1.0f, 1.5f, + 2.0f, -0.3f, 0.7f, 1.2f + }; + const std::size_t N = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < N; ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{N})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + + { + SOFIE_Clip::Session session("Clip_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(N, 12u); + for (size_t i = 0; i < N; ++i) { + float expected = std::max(clip_min, std::min(clip_max, input[i])); + EXPECT_LE(std::abs(res_ptr[i] - expected), TOLERANCE) << "i=" << i; + } +} + +TEST_F(SofieAlpakaTest, Not) +{ + // Input and output are bool tensors (uint8_t on device). 
+ std::vector input = {1, 0, 1, 1, 0, 0, 1, 0}; + const std::size_t N = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + uint8_t* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < N; ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{N})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + + { + SOFIE_Not::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(N, 8u); + for (size_t i = 0; i < N; ++i) + EXPECT_EQ(static_cast(res_ptr[i]), !static_cast(input[i])) << "i=" << i; +} + +// GNN model: 3370 nodes (29 features each), 24126 edges (5 features each), +// edge_index shape [2, 24126]. Output: sigmoid score per edge in [0, 1]. 
+TEST_F(SofieAlpakaTest, GNN_model) +{ + // ---- sizes ------------------------------------------------------- + constexpr Idx N_x = 97730; // 3370 nodes × 29 features + constexpr Idx N_ef = 120630; // 24126 edges × 5 features + constexpr Idx N_ei = 48252; // 2 rows × 24126 edges (int64) + constexpr Idx N_out = 24126; // one sigmoid score per edge + + // ---- host buffers ------------------------------------------------- + auto x_h = alpaka::allocBuf(host, Ext1D::all(Idx{N_x})); + auto ef_h = alpaka::allocBuf(host, Ext1D::all(Idx{N_ef})); + auto ei_h = alpaka::allocBuf(host, Ext1D::all(Idx{N_ei})); + + float* x_ptr = reinterpret_cast (alpaka::getPtrNative(x_h)); + float* ef_ptr = reinterpret_cast (alpaka::getPtrNative(ef_h)); + int64_t* ei_ptr = reinterpret_cast(alpaka::getPtrNative(ei_h)); + + for (Idx i = 0; i < N_x; ++i) x_ptr[i] = 0.5f; + for (Idx i = 0; i < N_ef; ++i) ef_ptr[i] = 0.5f; + for (Idx i = 0; i < N_ei; ++i) ei_ptr[i] = 0; // all self-loops on node 0 + + // ---- device buffers ----------------------------------------------- + auto x_d = alpaka::allocBuf(device, Ext1D::all(Idx{N_x})); + auto ef_d = alpaka::allocBuf(device, Ext1D::all(Idx{N_ef})); + auto ei_d = alpaka::allocBuf(device, Ext1D::all(Idx{N_ei})); + + alpaka::memcpy(queue, x_d, x_h); + alpaka::memcpy(queue, ef_d, ef_h); + alpaka::memcpy(queue, ei_d, ei_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N_out})); + + { + SOFIE_GNN_model::Session session("GNN_model_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(x_d, ef_d, ei_d); + alpaka::wait(session.queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(N_out, 24126u); + for (Idx i = 0; i < N_out; ++i) { + EXPECT_GE(res_ptr[i], 0.0f) << "output[" << i << "] < 0"; + EXPECT_LE(res_ptr[i], 1.0f) << "output[" << i << "] > 1"; + } +} diff --git 
a/src/SOFIE_core/test/TestCustomModelsFromROOT.cxx b/core/test/TestCustomModelsFromROOT.cxx similarity index 100% rename from src/SOFIE_core/test/TestCustomModelsFromROOT.cxx rename to core/test/TestCustomModelsFromROOT.cxx diff --git a/src/SOFIE_core/test/TestSofieModels.cxx b/core/test/TestSofieModels.cxx similarity index 100% rename from src/SOFIE_core/test/TestSofieModels.cxx rename to core/test/TestSofieModels.cxx diff --git a/src/SOFIE_core/test/input_models/Abs.onnx b/core/test/input_models/Abs.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Abs.onnx rename to core/test/input_models/Abs.onnx diff --git a/src/SOFIE_core/test/input_models/Add.onnx b/core/test/input_models/Add.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Add.onnx rename to core/test/input_models/Add.onnx diff --git a/src/SOFIE_core/test/input_models/AddBroadcast1.onnx b/core/test/input_models/AddBroadcast1.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AddBroadcast1.onnx rename to core/test/input_models/AddBroadcast1.onnx diff --git a/src/SOFIE_core/test/input_models/AddBroadcast2.onnx b/core/test/input_models/AddBroadcast2.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AddBroadcast2.onnx rename to core/test/input_models/AddBroadcast2.onnx diff --git a/src/SOFIE_core/test/input_models/AddBroadcast3.onnx b/core/test/input_models/AddBroadcast3.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AddBroadcast3.onnx rename to core/test/input_models/AddBroadcast3.onnx diff --git a/src/SOFIE_core/test/input_models/AddBroadcast4.onnx b/core/test/input_models/AddBroadcast4.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AddBroadcast4.onnx rename to core/test/input_models/AddBroadcast4.onnx diff --git a/src/SOFIE_core/test/input_models/AddBroadcast5.onnx b/core/test/input_models/AddBroadcast5.onnx similarity index 100% rename from 
src/SOFIE_core/test/input_models/AddBroadcast5.onnx rename to core/test/input_models/AddBroadcast5.onnx diff --git a/src/SOFIE_core/test/input_models/AddBroadcast6.onnx b/core/test/input_models/AddBroadcast6.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AddBroadcast6.onnx rename to core/test/input_models/AddBroadcast6.onnx diff --git a/src/SOFIE_core/test/input_models/AddBroadcast7.onnx b/core/test/input_models/AddBroadcast7.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AddBroadcast7.onnx rename to core/test/input_models/AddBroadcast7.onnx diff --git a/src/SOFIE_core/test/input_models/AvgPool.onnx b/core/test/input_models/AvgPool.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AvgPool.onnx rename to core/test/input_models/AvgPool.onnx diff --git a/core/test/input_models/BatchNorm.onnx b/core/test/input_models/BatchNorm.onnx new file mode 100644 index 0000000..f03cd9a Binary files /dev/null and b/core/test/input_models/BatchNorm.onnx differ diff --git a/core/test/input_models/BatchNormRelu.onnx b/core/test/input_models/BatchNormRelu.onnx new file mode 100644 index 0000000..badf2c2 Binary files /dev/null and b/core/test/input_models/BatchNormRelu.onnx differ diff --git a/src/SOFIE_core/test/input_models/Cast.onnx b/core/test/input_models/Cast.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Cast.onnx rename to core/test/input_models/Cast.onnx diff --git a/core/test/input_models/Clip.onnx b/core/test/input_models/Clip.onnx new file mode 100644 index 0000000..a91d748 Binary files /dev/null and b/core/test/input_models/Clip.onnx differ diff --git a/src/SOFIE_core/test/input_models/ComplexTopK.onnx b/core/test/input_models/ComplexTopK.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ComplexTopK.onnx rename to core/test/input_models/ComplexTopK.onnx diff --git a/src/SOFIE_core/test/input_models/Concat_0D.onnx b/core/test/input_models/Concat_0D.onnx 
similarity index 100% rename from src/SOFIE_core/test/input_models/Concat_0D.onnx rename to core/test/input_models/Concat_0D.onnx diff --git a/src/SOFIE_core/test/input_models/Constant.onnx b/core/test/input_models/Constant.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Constant.onnx rename to core/test/input_models/Constant.onnx diff --git a/src/SOFIE_core/test/input_models/ConvTranspose1d.onnx b/core/test/input_models/ConvTranspose1d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvTranspose1d.onnx rename to core/test/input_models/ConvTranspose1d.onnx diff --git a/src/SOFIE_core/test/input_models/ConvTranspose2d.onnx b/core/test/input_models/ConvTranspose2d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvTranspose2d.onnx rename to core/test/input_models/ConvTranspose2d.onnx diff --git a/src/SOFIE_core/test/input_models/ConvTransposeBias2d.onnx b/core/test/input_models/ConvTransposeBias2d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvTransposeBias2d.onnx rename to core/test/input_models/ConvTransposeBias2d.onnx diff --git a/src/SOFIE_core/test/input_models/ConvTransposeBias2dBatched.onnx b/core/test/input_models/ConvTransposeBias2dBatched.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvTransposeBias2dBatched.onnx rename to core/test/input_models/ConvTransposeBias2dBatched.onnx diff --git a/src/SOFIE_core/test/input_models/ConvWithAsymmetricPadding.onnx b/core/test/input_models/ConvWithAsymmetricPadding.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvWithAsymmetricPadding.onnx rename to core/test/input_models/ConvWithAsymmetricPadding.onnx diff --git a/src/SOFIE_core/test/input_models/ConvWithAutopadSameLower.onnx b/core/test/input_models/ConvWithAutopadSameLower.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvWithAutopadSameLower.onnx rename to 
core/test/input_models/ConvWithAutopadSameLower.onnx diff --git a/src/SOFIE_core/test/input_models/ConvWithPadding.onnx b/core/test/input_models/ConvWithPadding.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvWithPadding.onnx rename to core/test/input_models/ConvWithPadding.onnx diff --git a/src/SOFIE_core/test/input_models/ConvWithStridesNoPadding.onnx b/core/test/input_models/ConvWithStridesNoPadding.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvWithStridesNoPadding.onnx rename to core/test/input_models/ConvWithStridesNoPadding.onnx diff --git a/src/SOFIE_core/test/input_models/ConvWithStridesPadding.onnx b/core/test/input_models/ConvWithStridesPadding.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvWithStridesPadding.onnx rename to core/test/input_models/ConvWithStridesPadding.onnx diff --git a/src/SOFIE_core/test/input_models/ConvWithoutPadding.onnx b/core/test/input_models/ConvWithoutPadding.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvWithoutPadding.onnx rename to core/test/input_models/ConvWithoutPadding.onnx diff --git a/src/SOFIE_core/test/input_models/Cos.onnx b/core/test/input_models/Cos.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Cos.onnx rename to core/test/input_models/Cos.onnx diff --git a/src/SOFIE_core/test/input_models/Div.onnx b/core/test/input_models/Div.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Div.onnx rename to core/test/input_models/Div.onnx diff --git a/src/SOFIE_core/test/input_models/Einsum_3.onnx b/core/test/input_models/Einsum_3.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Einsum_3.onnx rename to core/test/input_models/Einsum_3.onnx diff --git a/src/SOFIE_core/test/input_models/Einsum_4.onnx b/core/test/input_models/Einsum_4.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Einsum_4.onnx rename to 
core/test/input_models/Einsum_4.onnx diff --git a/src/SOFIE_core/test/input_models/Einsum_dotprod.onnx b/core/test/input_models/Einsum_dotprod.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Einsum_dotprod.onnx rename to core/test/input_models/Einsum_dotprod.onnx diff --git a/src/SOFIE_core/test/input_models/Einsum_matmul.onnx b/core/test/input_models/Einsum_matmul.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Einsum_matmul.onnx rename to core/test/input_models/Einsum_matmul.onnx diff --git a/src/SOFIE_core/test/input_models/Elu.onnx b/core/test/input_models/Elu.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Elu.onnx rename to core/test/input_models/Elu.onnx diff --git a/src/SOFIE_core/test/input_models/Equal.onnx b/core/test/input_models/Equal.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Equal.onnx rename to core/test/input_models/Equal.onnx diff --git a/src/SOFIE_core/test/input_models/Erf.onnx b/core/test/input_models/Erf.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Erf.onnx rename to core/test/input_models/Erf.onnx diff --git a/src/SOFIE_core/test/input_models/Exp.onnx b/core/test/input_models/Exp.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Exp.onnx rename to core/test/input_models/Exp.onnx diff --git a/src/SOFIE_core/test/input_models/ExpandDiffSize.onnx b/core/test/input_models/ExpandDiffSize.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ExpandDiffSize.onnx rename to core/test/input_models/ExpandDiffSize.onnx diff --git a/src/SOFIE_core/test/input_models/ExpandSameSize.onnx b/core/test/input_models/ExpandSameSize.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ExpandSameSize.onnx rename to core/test/input_models/ExpandSameSize.onnx diff --git a/src/SOFIE_core/test/input_models/EyeLike.onnx b/core/test/input_models/EyeLike.onnx similarity index 100% rename 
from src/SOFIE_core/test/input_models/EyeLike.onnx rename to core/test/input_models/EyeLike.onnx diff --git a/core/test/input_models/GNN_model.onnx b/core/test/input_models/GNN_model.onnx new file mode 100644 index 0000000..833e34d Binary files /dev/null and b/core/test/input_models/GNN_model.onnx differ diff --git a/src/SOFIE_core/test/input_models/GRUBatchwise.onnx b/core/test/input_models/GRUBatchwise.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GRUBatchwise.onnx rename to core/test/input_models/GRUBatchwise.onnx diff --git a/src/SOFIE_core/test/input_models/GRUBidirectional.onnx b/core/test/input_models/GRUBidirectional.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GRUBidirectional.onnx rename to core/test/input_models/GRUBidirectional.onnx diff --git a/src/SOFIE_core/test/input_models/GRUDefaults.onnx b/core/test/input_models/GRUDefaults.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GRUDefaults.onnx rename to core/test/input_models/GRUDefaults.onnx diff --git a/src/SOFIE_core/test/input_models/GRUInitialBias.onnx b/core/test/input_models/GRUInitialBias.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GRUInitialBias.onnx rename to core/test/input_models/GRUInitialBias.onnx diff --git a/src/SOFIE_core/test/input_models/GRUSeqLength.onnx b/core/test/input_models/GRUSeqLength.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GRUSeqLength.onnx rename to core/test/input_models/GRUSeqLength.onnx diff --git a/src/SOFIE_core/test/input_models/Gather2d.onnx b/core/test/input_models/Gather2d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Gather2d.onnx rename to core/test/input_models/Gather2d.onnx diff --git a/src/SOFIE_core/test/input_models/GatherAxis0.onnx b/core/test/input_models/GatherAxis0.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GatherAxis0.onnx rename to 
core/test/input_models/GatherAxis0.onnx diff --git a/src/SOFIE_core/test/input_models/GatherAxis1.onnx b/core/test/input_models/GatherAxis1.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GatherAxis1.onnx rename to core/test/input_models/GatherAxis1.onnx diff --git a/src/SOFIE_core/test/input_models/GatherAxis2.onnx b/core/test/input_models/GatherAxis2.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GatherAxis2.onnx rename to core/test/input_models/GatherAxis2.onnx diff --git a/src/SOFIE_core/test/input_models/GatherAxis3.onnx b/core/test/input_models/GatherAxis3.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GatherAxis3.onnx rename to core/test/input_models/GatherAxis3.onnx diff --git a/core/test/input_models/GatherND_Batch.onnx b/core/test/input_models/GatherND_Batch.onnx new file mode 100644 index 0000000..4d146c6 Binary files /dev/null and b/core/test/input_models/GatherND_Batch.onnx differ diff --git a/core/test/input_models/GatherND_Ex1.onnx b/core/test/input_models/GatherND_Ex1.onnx new file mode 100644 index 0000000..bc1a910 Binary files /dev/null and b/core/test/input_models/GatherND_Ex1.onnx differ diff --git a/core/test/input_models/GatherND_Ex2.onnx b/core/test/input_models/GatherND_Ex2.onnx new file mode 100644 index 0000000..4cd511c Binary files /dev/null and b/core/test/input_models/GatherND_Ex2.onnx differ diff --git a/core/test/input_models/GatherND_Ex3.onnx b/core/test/input_models/GatherND_Ex3.onnx new file mode 100644 index 0000000..917008f Binary files /dev/null and b/core/test/input_models/GatherND_Ex3.onnx differ diff --git a/core/test/input_models/GatherND_Ex4.onnx b/core/test/input_models/GatherND_Ex4.onnx new file mode 100644 index 0000000..d3006a2 Binary files /dev/null and b/core/test/input_models/GatherND_Ex4.onnx differ diff --git a/core/test/input_models/GatherND_Ex5.onnx b/core/test/input_models/GatherND_Ex5.onnx new file mode 100644 index 0000000..be1ba0d 
Binary files /dev/null and b/core/test/input_models/GatherND_Ex5.onnx differ diff --git a/core/test/input_models/GatherND_NegativeIndices.onnx b/core/test/input_models/GatherND_NegativeIndices.onnx new file mode 100644 index 0000000..5fa05aa Binary files /dev/null and b/core/test/input_models/GatherND_NegativeIndices.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherNegativeIndices.onnx b/core/test/input_models/GatherNegativeIndices.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GatherNegativeIndices.onnx rename to core/test/input_models/GatherNegativeIndices.onnx diff --git a/src/SOFIE_core/test/input_models/Greater.onnx b/core/test/input_models/Greater.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Greater.onnx rename to core/test/input_models/Greater.onnx diff --git a/src/SOFIE_core/test/input_models/GreaterOrEqual.onnx b/core/test/input_models/GreaterOrEqual.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GreaterOrEqual.onnx rename to core/test/input_models/GreaterOrEqual.onnx diff --git a/core/test/input_models/IsInf.onnx b/core/test/input_models/IsInf.onnx new file mode 100644 index 0000000..b47fe82 Binary files /dev/null and b/core/test/input_models/IsInf.onnx differ diff --git a/core/test/input_models/IsNaN.onnx b/core/test/input_models/IsNaN.onnx new file mode 100644 index 0000000..d1a6e05 Binary files /dev/null and b/core/test/input_models/IsNaN.onnx differ diff --git a/src/SOFIE_core/test/input_models/LSTMBatchwise.onnx b/core/test/input_models/LSTMBatchwise.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LSTMBatchwise.onnx rename to core/test/input_models/LSTMBatchwise.onnx diff --git a/src/SOFIE_core/test/input_models/LSTMBidirectional.onnx b/core/test/input_models/LSTMBidirectional.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LSTMBidirectional.onnx rename to core/test/input_models/LSTMBidirectional.onnx diff --git 
a/src/SOFIE_core/test/input_models/LSTMDefaults.onnx b/core/test/input_models/LSTMDefaults.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LSTMDefaults.onnx rename to core/test/input_models/LSTMDefaults.onnx diff --git a/src/SOFIE_core/test/input_models/LSTMInitialBias.onnx b/core/test/input_models/LSTMInitialBias.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LSTMInitialBias.onnx rename to core/test/input_models/LSTMInitialBias.onnx diff --git a/src/SOFIE_core/test/input_models/LSTMPeepholes.onnx b/core/test/input_models/LSTMPeepholes.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LSTMPeepholes.onnx rename to core/test/input_models/LSTMPeepholes.onnx diff --git a/core/test/input_models/LayerNorm.onnx b/core/test/input_models/LayerNorm.onnx new file mode 100644 index 0000000..97142e7 Binary files /dev/null and b/core/test/input_models/LayerNorm.onnx differ diff --git a/core/test/input_models/LayerNorm3D.onnx b/core/test/input_models/LayerNorm3D.onnx new file mode 100644 index 0000000..c29afc0 Binary files /dev/null and b/core/test/input_models/LayerNorm3D.onnx differ diff --git a/core/test/input_models/LayerNormScaleBias.onnx b/core/test/input_models/LayerNormScaleBias.onnx new file mode 100644 index 0000000..99ea540 Binary files /dev/null and b/core/test/input_models/LayerNormScaleBias.onnx differ diff --git a/src/SOFIE_core/test/input_models/LayerNormalization2d.onnx b/core/test/input_models/LayerNormalization2d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LayerNormalization2d.onnx rename to core/test/input_models/LayerNormalization2d.onnx diff --git a/src/SOFIE_core/test/input_models/LayerNormalization4d.onnx b/core/test/input_models/LayerNormalization4d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LayerNormalization4d.onnx rename to core/test/input_models/LayerNormalization4d.onnx diff --git a/src/SOFIE_core/test/input_models/Less.onnx 
b/core/test/input_models/Less.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Less.onnx rename to core/test/input_models/Less.onnx diff --git a/src/SOFIE_core/test/input_models/LessOrEqual.onnx b/core/test/input_models/LessOrEqual.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LessOrEqual.onnx rename to core/test/input_models/LessOrEqual.onnx diff --git a/src/SOFIE_core/test/input_models/LinearWithLeakyRelu.onnx b/core/test/input_models/LinearWithLeakyRelu.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LinearWithLeakyRelu.onnx rename to core/test/input_models/LinearWithLeakyRelu.onnx diff --git a/src/SOFIE_core/test/input_models/LinearWithSelu.onnx b/core/test/input_models/LinearWithSelu.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LinearWithSelu.onnx rename to core/test/input_models/LinearWithSelu.onnx diff --git a/src/SOFIE_core/test/input_models/LinearWithSigmoid.onnx b/core/test/input_models/LinearWithSigmoid.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LinearWithSigmoid.onnx rename to core/test/input_models/LinearWithSigmoid.onnx diff --git a/src/SOFIE_core/test/input_models/Linear_16.onnx b/core/test/input_models/Linear_16.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Linear_16.onnx rename to core/test/input_models/Linear_16.onnx diff --git a/src/SOFIE_core/test/input_models/Linear_32.onnx b/core/test/input_models/Linear_32.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Linear_32.onnx rename to core/test/input_models/Linear_32.onnx diff --git a/src/SOFIE_core/test/input_models/Linear_64.onnx b/core/test/input_models/Linear_64.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Linear_64.onnx rename to core/test/input_models/Linear_64.onnx diff --git a/src/SOFIE_core/test/input_models/Log.onnx b/core/test/input_models/Log.onnx similarity index 100% rename from 
src/SOFIE_core/test/input_models/Log.onnx rename to core/test/input_models/Log.onnx diff --git a/src/SOFIE_core/test/input_models/Max.onnx b/core/test/input_models/Max.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Max.onnx rename to core/test/input_models/Max.onnx diff --git a/src/SOFIE_core/test/input_models/MaxMultidirectionalBroadcast.onnx b/core/test/input_models/MaxMultidirectionalBroadcast.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/MaxMultidirectionalBroadcast.onnx rename to core/test/input_models/MaxMultidirectionalBroadcast.onnx diff --git a/src/SOFIE_core/test/input_models/MaxPool1d.onnx b/core/test/input_models/MaxPool1d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/MaxPool1d.onnx rename to core/test/input_models/MaxPool1d.onnx diff --git a/src/SOFIE_core/test/input_models/MaxPool2d.onnx b/core/test/input_models/MaxPool2d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/MaxPool2d.onnx rename to core/test/input_models/MaxPool2d.onnx diff --git a/src/SOFIE_core/test/input_models/MaxPool3d.onnx b/core/test/input_models/MaxPool3d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/MaxPool3d.onnx rename to core/test/input_models/MaxPool3d.onnx diff --git a/src/SOFIE_core/test/input_models/MeanMultidirectionalBroadcast.onnx b/core/test/input_models/MeanMultidirectionalBroadcast.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/MeanMultidirectionalBroadcast.onnx rename to core/test/input_models/MeanMultidirectionalBroadcast.onnx diff --git a/src/SOFIE_core/test/input_models/MinMultidirectionalBroadcast.onnx b/core/test/input_models/MinMultidirectionalBroadcast.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/MinMultidirectionalBroadcast.onnx rename to core/test/input_models/MinMultidirectionalBroadcast.onnx diff --git a/src/SOFIE_core/test/input_models/Mul.onnx b/core/test/input_models/Mul.onnx 
similarity index 100% rename from src/SOFIE_core/test/input_models/Mul.onnx rename to core/test/input_models/Mul.onnx diff --git a/src/SOFIE_core/test/input_models/Neg.onnx b/core/test/input_models/Neg.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Neg.onnx rename to core/test/input_models/Neg.onnx diff --git a/core/test/input_models/Not.onnx b/core/test/input_models/Not.onnx new file mode 100644 index 0000000..b29ca99 Binary files /dev/null and b/core/test/input_models/Not.onnx differ diff --git a/src/SOFIE_core/test/input_models/Pad.onnx b/core/test/input_models/Pad.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Pad.onnx rename to core/test/input_models/Pad.onnx diff --git a/src/SOFIE_core/test/input_models/Pow.onnx b/core/test/input_models/Pow.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Pow.onnx rename to core/test/input_models/Pow.onnx diff --git a/src/SOFIE_core/test/input_models/Pow_broadcast.onnx b/core/test/input_models/Pow_broadcast.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Pow_broadcast.onnx rename to core/test/input_models/Pow_broadcast.onnx diff --git a/src/SOFIE_core/test/input_models/RNNBatchwise.onnx b/core/test/input_models/RNNBatchwise.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNBatchwise.onnx rename to core/test/input_models/RNNBatchwise.onnx diff --git a/src/SOFIE_core/test/input_models/RNNBidirectional.onnx b/core/test/input_models/RNNBidirectional.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNBidirectional.onnx rename to core/test/input_models/RNNBidirectional.onnx diff --git a/src/SOFIE_core/test/input_models/RNNBidirectionalBatchwise.onnx b/core/test/input_models/RNNBidirectionalBatchwise.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNBidirectionalBatchwise.onnx rename to core/test/input_models/RNNBidirectionalBatchwise.onnx diff --git 
a/src/SOFIE_core/test/input_models/RNNDefaults.onnx b/core/test/input_models/RNNDefaults.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNDefaults.onnx rename to core/test/input_models/RNNDefaults.onnx diff --git a/src/SOFIE_core/test/input_models/RNNSeqLength.onnx b/core/test/input_models/RNNSeqLength.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNSeqLength.onnx rename to core/test/input_models/RNNSeqLength.onnx diff --git a/src/SOFIE_core/test/input_models/RNNSequence.onnx b/core/test/input_models/RNNSequence.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNSequence.onnx rename to core/test/input_models/RNNSequence.onnx diff --git a/src/SOFIE_core/test/input_models/RNNSequenceBatchwise.onnx b/core/test/input_models/RNNSequenceBatchwise.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNSequenceBatchwise.onnx rename to core/test/input_models/RNNSequenceBatchwise.onnx diff --git a/src/SOFIE_core/test/input_models/RandomNormal.onnx b/core/test/input_models/RandomNormal.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RandomNormal.onnx rename to core/test/input_models/RandomNormal.onnx diff --git a/src/SOFIE_core/test/input_models/RandomUniform.onnx b/core/test/input_models/RandomUniform.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RandomUniform.onnx rename to core/test/input_models/RandomUniform.onnx diff --git a/src/SOFIE_core/test/input_models/RangeFloat.onnx b/core/test/input_models/RangeFloat.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RangeFloat.onnx rename to core/test/input_models/RangeFloat.onnx diff --git a/src/SOFIE_core/test/input_models/RangeInt.onnx b/core/test/input_models/RangeInt.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RangeInt.onnx rename to core/test/input_models/RangeInt.onnx diff --git a/src/SOFIE_core/test/input_models/Reciprocal.onnx 
b/core/test/input_models/Reciprocal.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Reciprocal.onnx rename to core/test/input_models/Reciprocal.onnx diff --git a/src/SOFIE_core/test/input_models/ReduceMean.onnx b/core/test/input_models/ReduceMean.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ReduceMean.onnx rename to core/test/input_models/ReduceMean.onnx diff --git a/src/SOFIE_core/test/input_models/ReduceProd.onnx b/core/test/input_models/ReduceProd.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ReduceProd.onnx rename to core/test/input_models/ReduceProd.onnx diff --git a/src/SOFIE_core/test/input_models/ReduceSum.onnx b/core/test/input_models/ReduceSum.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ReduceSum.onnx rename to core/test/input_models/ReduceSum.onnx diff --git a/src/SOFIE_core/test/input_models/ReduceSumSquare.onnx b/core/test/input_models/ReduceSumSquare.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ReduceSumSquare.onnx rename to core/test/input_models/ReduceSumSquare.onnx diff --git a/src/SOFIE_core/test/input_models/ScatterElements.onnx b/core/test/input_models/ScatterElements.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ScatterElements.onnx rename to core/test/input_models/ScatterElements.onnx diff --git a/src/SOFIE_core/test/input_models/Shape.onnx b/core/test/input_models/Shape.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Shape.onnx rename to core/test/input_models/Shape.onnx diff --git a/src/SOFIE_core/test/input_models/Sin.onnx b/core/test/input_models/Sin.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Sin.onnx rename to core/test/input_models/Sin.onnx diff --git a/src/SOFIE_core/test/input_models/Slice.onnx b/core/test/input_models/Slice.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Slice.onnx rename to 
core/test/input_models/Slice.onnx diff --git a/src/SOFIE_core/test/input_models/Slice_Default_Axis.onnx b/core/test/input_models/Slice_Default_Axis.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Slice_Default_Axis.onnx rename to core/test/input_models/Slice_Default_Axis.onnx diff --git a/src/SOFIE_core/test/input_models/Slice_Default_Steps.onnx b/core/test/input_models/Slice_Default_Steps.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Slice_Default_Steps.onnx rename to core/test/input_models/Slice_Default_Steps.onnx diff --git a/src/SOFIE_core/test/input_models/Slice_Neg.onnx b/core/test/input_models/Slice_Neg.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Slice_Neg.onnx rename to core/test/input_models/Slice_Neg.onnx diff --git a/src/SOFIE_core/test/input_models/Softmax1d.onnx b/core/test/input_models/Softmax1d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Softmax1d.onnx rename to core/test/input_models/Softmax1d.onnx diff --git a/src/SOFIE_core/test/input_models/Softmax2d.onnx b/core/test/input_models/Softmax2d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Softmax2d.onnx rename to core/test/input_models/Softmax2d.onnx diff --git a/src/SOFIE_core/test/input_models/Softmax3d.onnx b/core/test/input_models/Softmax3d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Softmax3d.onnx rename to core/test/input_models/Softmax3d.onnx diff --git a/src/SOFIE_core/test/input_models/Softmax4d.onnx b/core/test/input_models/Softmax4d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Softmax4d.onnx rename to core/test/input_models/Softmax4d.onnx diff --git a/core/test/input_models/Softplus.onnx b/core/test/input_models/Softplus.onnx new file mode 100644 index 0000000..2f6a69f --- /dev/null +++ b/core/test/input_models/Softplus.onnx @@ -0,0 +1,11 @@ +  onnx-example:S + +inputoutput"SoftplusAbsZ +input +  + +b 
+output +  + +B \ No newline at end of file diff --git a/src/SOFIE_core/test/input_models/Split_0.onnx b/core/test/input_models/Split_0.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Split_0.onnx rename to core/test/input_models/Split_0.onnx diff --git a/src/SOFIE_core/test/input_models/Split_1.onnx b/core/test/input_models/Split_1.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Split_1.onnx rename to core/test/input_models/Split_1.onnx diff --git a/src/SOFIE_core/test/input_models/Split_2.onnx b/core/test/input_models/Split_2.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Split_2.onnx rename to core/test/input_models/Split_2.onnx diff --git a/src/SOFIE_core/test/input_models/Sqrt.onnx b/core/test/input_models/Sqrt.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Sqrt.onnx rename to core/test/input_models/Sqrt.onnx diff --git a/src/SOFIE_core/test/input_models/Sub.onnx b/core/test/input_models/Sub.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Sub.onnx rename to core/test/input_models/Sub.onnx diff --git a/src/SOFIE_core/test/input_models/SumMultidirectionalBroadcast.onnx b/core/test/input_models/SumMultidirectionalBroadcast.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/SumMultidirectionalBroadcast.onnx rename to core/test/input_models/SumMultidirectionalBroadcast.onnx diff --git a/src/SOFIE_core/test/input_models/Tanh.onnx b/core/test/input_models/Tanh.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Tanh.onnx rename to core/test/input_models/Tanh.onnx diff --git a/src/SOFIE_core/test/input_models/Tile5D.onnx b/core/test/input_models/Tile5D.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Tile5D.onnx rename to core/test/input_models/Tile5D.onnx diff --git a/src/SOFIE_core/test/input_models/TopK.onnx b/core/test/input_models/TopK.onnx similarity index 100% rename from 
src/SOFIE_core/test/input_models/TopK.onnx rename to core/test/input_models/TopK.onnx diff --git a/core/test/input_models/Transpose.onnx b/core/test/input_models/Transpose.onnx new file mode 100644 index 0000000..0e08157 Binary files /dev/null and b/core/test/input_models/Transpose.onnx differ diff --git a/src/SOFIE_core/test/input_models/Where.onnx b/core/test/input_models/Where.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Where.onnx rename to core/test/input_models/Where.onnx diff --git a/src/SOFIE_core/test/input_models/references/Add.ref.hxx b/core/test/input_models/references/Add.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Add.ref.hxx rename to core/test/input_models/references/Add.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast1.ref.hxx b/core/test/input_models/references/AddBroadcast1.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast1.ref.hxx rename to core/test/input_models/references/AddBroadcast1.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast2.ref.hxx b/core/test/input_models/references/AddBroadcast2.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast2.ref.hxx rename to core/test/input_models/references/AddBroadcast2.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast3.ref.hxx b/core/test/input_models/references/AddBroadcast3.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast3.ref.hxx rename to core/test/input_models/references/AddBroadcast3.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast4.ref.hxx b/core/test/input_models/references/AddBroadcast4.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast4.ref.hxx rename to core/test/input_models/references/AddBroadcast4.ref.hxx diff --git 
a/src/SOFIE_core/test/input_models/references/AddBroadcast5.ref.hxx b/core/test/input_models/references/AddBroadcast5.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast5.ref.hxx rename to core/test/input_models/references/AddBroadcast5.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast6.ref.hxx b/core/test/input_models/references/AddBroadcast6.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast6.ref.hxx rename to core/test/input_models/references/AddBroadcast6.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast7.ref.hxx b/core/test/input_models/references/AddBroadcast7.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast7.ref.hxx rename to core/test/input_models/references/AddBroadcast7.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AvgPool.ref.hxx b/core/test/input_models/references/AvgPool.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AvgPool.ref.hxx rename to core/test/input_models/references/AvgPool.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Cast.ref.hxx b/core/test/input_models/references/Cast.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Cast.ref.hxx rename to core/test/input_models/references/Cast.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ComplexTopK.ref.hxx b/core/test/input_models/references/ComplexTopK.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ComplexTopK.ref.hxx rename to core/test/input_models/references/ComplexTopK.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Constant.ref.hxx b/core/test/input_models/references/Constant.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Constant.ref.hxx rename to 
core/test/input_models/references/Constant.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvTranspose1d.ref.hxx b/core/test/input_models/references/ConvTranspose1d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvTranspose1d.ref.hxx rename to core/test/input_models/references/ConvTranspose1d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvTranspose2d.ref.hxx b/core/test/input_models/references/ConvTranspose2d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvTranspose2d.ref.hxx rename to core/test/input_models/references/ConvTranspose2d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvTranspose3d.ref.hxx b/core/test/input_models/references/ConvTranspose3d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvTranspose3d.ref.hxx rename to core/test/input_models/references/ConvTranspose3d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvTransposeBias2d.ref.hxx b/core/test/input_models/references/ConvTransposeBias2d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvTransposeBias2d.ref.hxx rename to core/test/input_models/references/ConvTransposeBias2d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvTransposeBias2dBatched.ref.hxx b/core/test/input_models/references/ConvTransposeBias2dBatched.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvTransposeBias2dBatched.ref.hxx rename to core/test/input_models/references/ConvTransposeBias2dBatched.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvWithAsymmetricPadding.ref.hxx b/core/test/input_models/references/ConvWithAsymmetricPadding.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvWithAsymmetricPadding.ref.hxx rename to 
core/test/input_models/references/ConvWithAsymmetricPadding.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvWithAutopadSameLower.ref.hxx b/core/test/input_models/references/ConvWithAutopadSameLower.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvWithAutopadSameLower.ref.hxx rename to core/test/input_models/references/ConvWithAutopadSameLower.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvWithPadding.ref.hxx b/core/test/input_models/references/ConvWithPadding.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvWithPadding.ref.hxx rename to core/test/input_models/references/ConvWithPadding.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvWithStridesNoPadding.ref.hxx b/core/test/input_models/references/ConvWithStridesNoPadding.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvWithStridesNoPadding.ref.hxx rename to core/test/input_models/references/ConvWithStridesNoPadding.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvWithStridesPadding.ref.hxx b/core/test/input_models/references/ConvWithStridesPadding.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvWithStridesPadding.ref.hxx rename to core/test/input_models/references/ConvWithStridesPadding.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvWithoutPadding.ref.hxx b/core/test/input_models/references/ConvWithoutPadding.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvWithoutPadding.ref.hxx rename to core/test/input_models/references/ConvWithoutPadding.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Div.ref.hxx b/core/test/input_models/references/Div.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Div.ref.hxx rename to core/test/input_models/references/Div.ref.hxx 
diff --git a/src/SOFIE_core/test/input_models/references/Elu.ref.hxx b/core/test/input_models/references/Elu.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Elu.ref.hxx rename to core/test/input_models/references/Elu.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Equal.ref.hxx b/core/test/input_models/references/Equal.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Equal.ref.hxx rename to core/test/input_models/references/Equal.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Erf.ref.hxx b/core/test/input_models/references/Erf.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Erf.ref.hxx rename to core/test/input_models/references/Erf.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Exp.ref.hxx b/core/test/input_models/references/Exp.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Exp.ref.hxx rename to core/test/input_models/references/Exp.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ExpandDiffSize.ref.hxx b/core/test/input_models/references/ExpandDiffSize.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ExpandDiffSize.ref.hxx rename to core/test/input_models/references/ExpandDiffSize.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ExpandSameSize.ref.hxx b/core/test/input_models/references/ExpandSameSize.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ExpandSameSize.ref.hxx rename to core/test/input_models/references/ExpandSameSize.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/EyeLike.ref.hxx b/core/test/input_models/references/EyeLike.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/EyeLike.ref.hxx rename to core/test/input_models/references/EyeLike.ref.hxx diff --git 
a/src/SOFIE_core/test/input_models/references/GRUBatchwise.ref.hxx b/core/test/input_models/references/GRUBatchwise.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GRUBatchwise.ref.hxx rename to core/test/input_models/references/GRUBatchwise.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GRUBidirectional.ref.hxx b/core/test/input_models/references/GRUBidirectional.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GRUBidirectional.ref.hxx rename to core/test/input_models/references/GRUBidirectional.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GRUDefaults.ref.hxx b/core/test/input_models/references/GRUDefaults.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GRUDefaults.ref.hxx rename to core/test/input_models/references/GRUDefaults.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GRUInitialBias.ref.hxx b/core/test/input_models/references/GRUInitialBias.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GRUInitialBias.ref.hxx rename to core/test/input_models/references/GRUInitialBias.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GRUSeqLength.ref.hxx b/core/test/input_models/references/GRUSeqLength.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GRUSeqLength.ref.hxx rename to core/test/input_models/references/GRUSeqLength.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Gather2d.ref.hxx b/core/test/input_models/references/Gather2d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Gather2d.ref.hxx rename to core/test/input_models/references/Gather2d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GatherAxis0.ref.hxx b/core/test/input_models/references/GatherAxis0.ref.hxx similarity index 100% rename from 
src/SOFIE_core/test/input_models/references/GatherAxis0.ref.hxx rename to core/test/input_models/references/GatherAxis0.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GatherAxis1.ref.hxx b/core/test/input_models/references/GatherAxis1.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GatherAxis1.ref.hxx rename to core/test/input_models/references/GatherAxis1.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GatherAxis2.ref.hxx b/core/test/input_models/references/GatherAxis2.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GatherAxis2.ref.hxx rename to core/test/input_models/references/GatherAxis2.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GatherAxis3.ref.hxx b/core/test/input_models/references/GatherAxis3.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GatherAxis3.ref.hxx rename to core/test/input_models/references/GatherAxis3.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GatherNegativeIndices.ref.hxx b/core/test/input_models/references/GatherNegativeIndices.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GatherNegativeIndices.ref.hxx rename to core/test/input_models/references/GatherNegativeIndices.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Greater.ref.hxx b/core/test/input_models/references/Greater.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Greater.ref.hxx rename to core/test/input_models/references/Greater.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GreaterOrEqual.ref.hxx b/core/test/input_models/references/GreaterOrEqual.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GreaterOrEqual.ref.hxx rename to core/test/input_models/references/GreaterOrEqual.ref.hxx diff --git 
a/src/SOFIE_core/test/input_models/references/LSTMBatchwise.ref.hxx b/core/test/input_models/references/LSTMBatchwise.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LSTMBatchwise.ref.hxx rename to core/test/input_models/references/LSTMBatchwise.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LSTMBidirectional.ref.hxx b/core/test/input_models/references/LSTMBidirectional.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LSTMBidirectional.ref.hxx rename to core/test/input_models/references/LSTMBidirectional.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LSTMDefaults.ref.hxx b/core/test/input_models/references/LSTMDefaults.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LSTMDefaults.ref.hxx rename to core/test/input_models/references/LSTMDefaults.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LSTMInitialBias.ref.hxx b/core/test/input_models/references/LSTMInitialBias.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LSTMInitialBias.ref.hxx rename to core/test/input_models/references/LSTMInitialBias.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LSTMPeepholes.ref.hxx b/core/test/input_models/references/LSTMPeepholes.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LSTMPeepholes.ref.hxx rename to core/test/input_models/references/LSTMPeepholes.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LayerNormalization2d.hxx b/core/test/input_models/references/LayerNormalization2d.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LayerNormalization2d.hxx rename to core/test/input_models/references/LayerNormalization2d.hxx diff --git a/src/SOFIE_core/test/input_models/references/LayerNormalization4d.hxx b/core/test/input_models/references/LayerNormalization4d.hxx similarity index 
100% rename from src/SOFIE_core/test/input_models/references/LayerNormalization4d.hxx rename to core/test/input_models/references/LayerNormalization4d.hxx diff --git a/src/SOFIE_core/test/input_models/references/Less.ref.hxx b/core/test/input_models/references/Less.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Less.ref.hxx rename to core/test/input_models/references/Less.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LessOrEqual.ref.hxx b/core/test/input_models/references/LessOrEqual.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LessOrEqual.ref.hxx rename to core/test/input_models/references/LessOrEqual.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LinearWithLeakyRelu.ref.hxx b/core/test/input_models/references/LinearWithLeakyRelu.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LinearWithLeakyRelu.ref.hxx rename to core/test/input_models/references/LinearWithLeakyRelu.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LinearWithSelu.ref.hxx b/core/test/input_models/references/LinearWithSelu.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LinearWithSelu.ref.hxx rename to core/test/input_models/references/LinearWithSelu.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LinearWithSigmoid.ref.hxx b/core/test/input_models/references/LinearWithSigmoid.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LinearWithSigmoid.ref.hxx rename to core/test/input_models/references/LinearWithSigmoid.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Linear_16.ref.hxx b/core/test/input_models/references/Linear_16.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Linear_16.ref.hxx rename to core/test/input_models/references/Linear_16.ref.hxx diff --git 
a/src/SOFIE_core/test/input_models/references/Linear_32.ref.hxx b/core/test/input_models/references/Linear_32.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Linear_32.ref.hxx rename to core/test/input_models/references/Linear_32.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Linear_64.ref.hxx b/core/test/input_models/references/Linear_64.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Linear_64.ref.hxx rename to core/test/input_models/references/Linear_64.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Log.ref.hxx b/core/test/input_models/references/Log.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Log.ref.hxx rename to core/test/input_models/references/Log.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Max.ref.hxx b/core/test/input_models/references/Max.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Max.ref.hxx rename to core/test/input_models/references/Max.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/MaxMultidirectionalBroadcast.ref.hxx b/core/test/input_models/references/MaxMultidirectionalBroadcast.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/MaxMultidirectionalBroadcast.ref.hxx rename to core/test/input_models/references/MaxMultidirectionalBroadcast.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/MaxPool1d.ref.hxx b/core/test/input_models/references/MaxPool1d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/MaxPool1d.ref.hxx rename to core/test/input_models/references/MaxPool1d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/MaxPool2d.ref.hxx b/core/test/input_models/references/MaxPool2d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/MaxPool2d.ref.hxx rename to 
core/test/input_models/references/MaxPool2d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/MaxPool3d.ref.hxx b/core/test/input_models/references/MaxPool3d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/MaxPool3d.ref.hxx rename to core/test/input_models/references/MaxPool3d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/MeanMultidirectionalBroadcast.ref.hxx b/core/test/input_models/references/MeanMultidirectionalBroadcast.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/MeanMultidirectionalBroadcast.ref.hxx rename to core/test/input_models/references/MeanMultidirectionalBroadcast.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/MinMultidirectionalBroadcast.ref.hxx b/core/test/input_models/references/MinMultidirectionalBroadcast.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/MinMultidirectionalBroadcast.ref.hxx rename to core/test/input_models/references/MinMultidirectionalBroadcast.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Mul.ref.hxx b/core/test/input_models/references/Mul.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Mul.ref.hxx rename to core/test/input_models/references/Mul.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Neg.ref.hxx b/core/test/input_models/references/Neg.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Neg.ref.hxx rename to core/test/input_models/references/Neg.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Pow.ref.hxx b/core/test/input_models/references/Pow.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Pow.ref.hxx rename to core/test/input_models/references/Pow.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Pow_broadcast.ref.hxx b/core/test/input_models/references/Pow_broadcast.ref.hxx 
similarity index 100% rename from src/SOFIE_core/test/input_models/references/Pow_broadcast.ref.hxx rename to core/test/input_models/references/Pow_broadcast.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNBatchwise.ref.hxx b/core/test/input_models/references/RNNBatchwise.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RNNBatchwise.ref.hxx rename to core/test/input_models/references/RNNBatchwise.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNBidirectional.ref.hxx b/core/test/input_models/references/RNNBidirectional.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RNNBidirectional.ref.hxx rename to core/test/input_models/references/RNNBidirectional.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNBidirectionalBatchwise.ref.hxx b/core/test/input_models/references/RNNBidirectionalBatchwise.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RNNBidirectionalBatchwise.ref.hxx rename to core/test/input_models/references/RNNBidirectionalBatchwise.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNDefaults.ref.hxx b/core/test/input_models/references/RNNDefaults.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RNNDefaults.ref.hxx rename to core/test/input_models/references/RNNDefaults.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNSeqLength.ref.hxx b/core/test/input_models/references/RNNSeqLength.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RNNSeqLength.ref.hxx rename to core/test/input_models/references/RNNSeqLength.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNSequence.ref.hxx b/core/test/input_models/references/RNNSequence.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RNNSequence.ref.hxx rename to 
core/test/input_models/references/RNNSequence.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNSequenceBatchwise.ref.hxx b/core/test/input_models/references/RNNSequenceBatchwise.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RNNSequenceBatchwise.ref.hxx rename to core/test/input_models/references/RNNSequenceBatchwise.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RangeFloat.ref.hxx b/core/test/input_models/references/RangeFloat.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RangeFloat.ref.hxx rename to core/test/input_models/references/RangeFloat.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RangeInt.ref.hxx b/core/test/input_models/references/RangeInt.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RangeInt.ref.hxx rename to core/test/input_models/references/RangeInt.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Reciprocal.ref.hxx b/core/test/input_models/references/Reciprocal.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Reciprocal.ref.hxx rename to core/test/input_models/references/Reciprocal.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ReduceMean.ref.hxx b/core/test/input_models/references/ReduceMean.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ReduceMean.ref.hxx rename to core/test/input_models/references/ReduceMean.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ReduceProd.ref.hxx b/core/test/input_models/references/ReduceProd.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ReduceProd.ref.hxx rename to core/test/input_models/references/ReduceProd.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Shape.ref.hxx b/core/test/input_models/references/Shape.ref.hxx similarity index 100% rename from 
src/SOFIE_core/test/input_models/references/Shape.ref.hxx rename to core/test/input_models/references/Shape.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Slice.ref.hxx b/core/test/input_models/references/Slice.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Slice.ref.hxx rename to core/test/input_models/references/Slice.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Slice_Default_Axis.ref.hxx b/core/test/input_models/references/Slice_Default_Axis.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Slice_Default_Axis.ref.hxx rename to core/test/input_models/references/Slice_Default_Axis.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Slice_Default_Steps.ref.hxx b/core/test/input_models/references/Slice_Default_Steps.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Slice_Default_Steps.ref.hxx rename to core/test/input_models/references/Slice_Default_Steps.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Slice_Neg.ref.hxx b/core/test/input_models/references/Slice_Neg.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Slice_Neg.ref.hxx rename to core/test/input_models/references/Slice_Neg.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Softmax1d.ref.hxx b/core/test/input_models/references/Softmax1d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Softmax1d.ref.hxx rename to core/test/input_models/references/Softmax1d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Softmax2d.ref.hxx b/core/test/input_models/references/Softmax2d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Softmax2d.ref.hxx rename to core/test/input_models/references/Softmax2d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Softmax3d.ref.hxx 
b/core/test/input_models/references/Softmax3d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Softmax3d.ref.hxx rename to core/test/input_models/references/Softmax3d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Softmax4d.ref.hxx b/core/test/input_models/references/Softmax4d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Softmax4d.ref.hxx rename to core/test/input_models/references/Softmax4d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Sqrt.ref.hxx b/core/test/input_models/references/Sqrt.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Sqrt.ref.hxx rename to core/test/input_models/references/Sqrt.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Sub.ref.hxx b/core/test/input_models/references/Sub.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Sub.ref.hxx rename to core/test/input_models/references/Sub.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/SumMultidirectionalBroadcast.ref.hxx b/core/test/input_models/references/SumMultidirectionalBroadcast.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/SumMultidirectionalBroadcast.ref.hxx rename to core/test/input_models/references/SumMultidirectionalBroadcast.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Tanh.ref.hxx b/core/test/input_models/references/Tanh.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Tanh.ref.hxx rename to core/test/input_models/references/Tanh.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Tile5D.ref.hxx b/core/test/input_models/references/Tile5D.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Tile5D.ref.hxx rename to core/test/input_models/references/Tile5D.ref.hxx diff --git 
a/src/SOFIE_core/test/input_models/references/TopK.ref.hxx b/core/test/input_models/references/TopK.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/TopK.ref.hxx rename to core/test/input_models/references/TopK.ref.hxx diff --git a/src/SOFIE_parsers/CMakeLists.txt b/parsers/CMakeLists.txt similarity index 79% rename from src/SOFIE_parsers/CMakeLists.txt rename to parsers/CMakeLists.txt index 379b7d7..6173490 100644 --- a/src/SOFIE_parsers/CMakeLists.txt +++ b/parsers/CMakeLists.txt @@ -5,7 +5,7 @@ # For the list of contributors see $ROOTSYS/README/CREDITS. ############################################################################ -# CMakeLists.txt file for building TMVA SOFIE package +# CMakeLists.txt file for building SOFIE package ############################################################################ #Author: Sitong An, Lorenzo Moneta 10/03/2021 @@ -26,13 +26,15 @@ set(source_headers ) list(TRANSFORM source_headers PREPEND "inc/") target_include_directories(SOFIE_parsers - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/inc + PUBLIC + $ + $ ) set(sources_cxx src/RModelParser_ONNX.cxx src/ParseBasicUnary.cxx src/ParseBasicBinary.cxx + src/ParseBasicIs.cxx src/ParseBatchNormalization.cxx src/ParseCast.cxx src/ParseConcat.cxx @@ -61,6 +63,7 @@ set(sources_cxx src/ParseLayerNormalization.cxx src/ParseExpand.cxx src/ParseGather.cxx + src/ParseGatherND.cxx src/ParseElu.cxx src/ParseFuseConvAdd.cxx src/ParseFuseConvTransposeAdd.cxx @@ -79,6 +82,8 @@ set(sources_cxx src/ParseWhere.cxx src/ParseEinsum.cxx src/ParseRandom.cxx + src/ParseNot.cxx + src/ParseClip.cxx src/ParseScatterElements.cxx ${PROTO_SRCS} ${DEPENDENCIES} @@ -102,7 +107,21 @@ target_include_directories(SOFIE_parsers PUBLIC set_target_properties(SOFIE_parsers PROPERTIES POSITION_INDEPENDENT_CODE TRUE) +if(SOFIE_WITH_ROOT AND ROOT_FOUND) + ROOT_GENERATE_DICTIONARY(G__SOFIE_parsers ${sources_headers} + LINKDEF inc/LinkDef.h + MODULE SOFIE_parsers + OPTIONS --deep + ) + 
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers_rdict.pcm + ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers.rootmap + DESTINATION lib) +endif() + install(TARGETS SOFIE_parsers - LIBRARY DESTINATION lib + EXPORT SOFIETargets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} +) +install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/inc/" + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) -install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/inc/" DESTINATION "include") diff --git a/src/SOFIE_parsers/inc/LinkDef.h b/parsers/inc/LinkDef.h similarity index 100% rename from src/SOFIE_parsers/inc/LinkDef.h rename to parsers/inc/LinkDef.h diff --git a/src/SOFIE_parsers/inc/SOFIE/RModelParser_ONNX.hxx b/parsers/inc/SOFIE/RModelParser_ONNX.hxx similarity index 100% rename from src/SOFIE_parsers/inc/SOFIE/RModelParser_ONNX.hxx rename to parsers/inc/SOFIE/RModelParser_ONNX.hxx diff --git a/src/SOFIE_parsers/onnx_proto3 b/parsers/onnx_proto3 similarity index 100% rename from src/SOFIE_parsers/onnx_proto3 rename to parsers/onnx_proto3 diff --git a/src/SOFIE_parsers/src/ParseBasicBinary.cxx b/parsers/src/ParseBasicBinary.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseBasicBinary.cxx rename to parsers/src/ParseBasicBinary.cxx diff --git a/parsers/src/ParseBasicIs.cxx b/parsers/src/ParseBasicIs.cxx new file mode 100644 index 0000000..a1abad4 --- /dev/null +++ b/parsers/src/ParseBasicIs.cxx @@ -0,0 +1,66 @@ +#include "SOFIE/RModelParser_ONNX.hxx" +#include "SOFIE/ROperator_Basic_Is.hxx" +#include "onnx_proto3.pb.h" + +namespace SOFIE { + +template +std::unique_ptr ParseBasicIs(RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) +{ + + std::string input_name = nodeproto.input(0); + if (!parser.IsRegisteredTensorType(input_name)) { + throw + std::runtime_error("SOFIE ONNX Parser " + IsOpTraits::Name() + " op has input tensor " + input_name + + " but its type is not yet registered"); + } + + // get attributes for the IsInf operator + int detect_negative = 1; + int 
detect_positive = 1; + for (int_t i = 0; i < nodeproto.attribute_size(); i++) { + std::string attribute_name = nodeproto.attribute(i).name(); + if (attribute_name == "detect_negative") + detect_negative = nodeproto.attribute(i).i(); + if (attribute_name == "detect_positive") + detect_positive = nodeproto.attribute(i).i(); + } + + if (detect_positive == 0 && detect_negative == 0) + throw std::runtime_error("SOFIE ONNX Parser IsInf op has invalide attributes"); + + + std::unique_ptr op; + std::string output_name = nodeproto.output(0); + + if (nodeproto.attribute_size() == 0 || (detect_negative == 1 && detect_positive == 1)) + op.reset(new ROperator_Basic_Is(input_name, output_name)); + else if (nodeproto.attribute_size() > 0) { + // case detect_negative or detective_positive are set + if (detect_negative == 0) + op.reset(new ROperator_Basic_Is(input_name, output_name)); + else if (detect_positive == 0) + op.reset(new ROperator_Basic_Is(input_name, output_name)); + } else + throw std::runtime_error("SOFIE ONNX Parser " + IsOpTraits::Name() + " operator - invalid attributes"); + + // Register the output type (is always BOOL) + if (!parser.IsRegisteredTensorType(output_name)) { + parser.RegisterTensorType(output_name, ETensorType::BOOL); + } + + return op; +}; + +// Parse IsNaN +ParserFuncSignature ParseIsNaN = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseBasicIs(parser, nodeproto); +}; + +// Parse IsInf +ParserFuncSignature ParseIsInf = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseBasicIs(parser, nodeproto); +}; + + +} // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParseBasicNary.cxx b/parsers/src/ParseBasicNary.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseBasicNary.cxx rename to parsers/src/ParseBasicNary.cxx diff --git a/src/SOFIE_parsers/src/ParseBasicUnary.cxx b/parsers/src/ParseBasicUnary.cxx similarity index 82% rename from src/SOFIE_parsers/src/ParseBasicUnary.cxx 
rename to parsers/src/ParseBasicUnary.cxx index 1470f26..40d0225 100644 --- a/src/SOFIE_parsers/src/ParseBasicUnary.cxx +++ b/parsers/src/ParseBasicUnary.cxx @@ -79,5 +79,20 @@ ParserFuncSignature ParseAbs = [](RModelParser_ONNX &parser, const onnx::NodePro return ParseBasicUnary(parser, nodeproto); }; +//Parse Softplus +ParserFuncSignature ParseSoftplus = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseBasicUnary(parser, nodeproto); +}; + +//Parse Atan +ParserFuncSignature ParseAtan = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseBasicUnary(parser, nodeproto); +}; + +//Parse Floor +ParserFuncSignature ParseFloor = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseBasicUnary(parser, nodeproto); +}; + } // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParseBatchNormalization.cxx b/parsers/src/ParseBatchNormalization.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseBatchNormalization.cxx rename to parsers/src/ParseBatchNormalization.cxx diff --git a/src/SOFIE_parsers/src/ParseCast.cxx b/parsers/src/ParseCast.cxx similarity index 78% rename from src/SOFIE_parsers/src/ParseCast.cxx rename to parsers/src/ParseCast.cxx index 7685421..a0993d4 100644 --- a/src/SOFIE_parsers/src/ParseCast.cxx +++ b/parsers/src/ParseCast.cxx @@ -13,20 +13,19 @@ ParserFuncSignature ParseCast = [](RModelParser_ONNX &parser, const onnx::NodePr } std::unique_ptr op; - std::string attr_type; + ETensorType attr_type; for (int_t i = 0; i < nodeproto.attribute_size(); i++) { std::string attribute_name = nodeproto.attribute(i).name(); if (attribute_name == "to") - attr_type = ConvertTypeToString(static_cast(nodeproto.attribute(i).i())); + attr_type = static_cast(nodeproto.attribute(i).i()); } std::string output_name = nodeproto.output(0); op.reset(new ROperator_Cast(attr_type, nodeproto.input(0), output_name)); if (!parser.IsRegisteredTensorType(output_name)) { - ETensorType 
output_type = ConvertStringToType(attr_type); - parser.RegisterTensorType(output_name, output_type); + parser.RegisterTensorType(output_name, attr_type); } return op; diff --git a/parsers/src/ParseClip.cxx b/parsers/src/ParseClip.cxx new file mode 100644 index 0000000..4424c76 --- /dev/null +++ b/parsers/src/ParseClip.cxx @@ -0,0 +1,46 @@ +#include "SOFIE/RModelParser_ONNX.hxx" +#include "SOFIE/ROperator_Clip.hxx" +#include "onnx_proto3.pb.h" + +namespace SOFIE { + +ParserFuncSignature ParseClip = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) +{ + ETensorType input_type = ETensorType::UNDEFINED; + + std::string input_name = nodeproto.input(0); + if (parser.IsRegisteredTensorType(input_name)) { + input_type = parser.GetTensorType(input_name); + } else { + throw std::runtime_error("SOFIE ONNX Parser Clip op has input tensor " + input_name + + " but its type is not yet registered"); + } + + std::string output_name = nodeproto.output(0); + + // ONNX opset 11+: min and max are optional tensor inputs (empty string when absent) + std::string min_name = (nodeproto.input_size() > 1 && !nodeproto.input(1).empty()) + ? nodeproto.input(1) : ""; + std::string max_name = (nodeproto.input_size() > 2 && !nodeproto.input(2).empty()) + ? 
nodeproto.input(2) : ""; + + std::unique_ptr op; + switch (input_type) { + case ETensorType::FLOAT: + op.reset(new ROperator_Clip(input_name, output_name, min_name, max_name)); + break; + case ETensorType::DOUBLE: + op.reset(new ROperator_Clip(input_name, output_name, min_name, max_name)); + break; + default: + throw std::runtime_error("SOFIE ONNX Parser Clip op does not yet support input type " + + std::to_string(static_cast(input_type))); + } + + if (!parser.IsRegisteredTensorType(output_name)) + parser.RegisterTensorType(output_name, input_type); + + return op; +}; + +} // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParseComparision.cxx b/parsers/src/ParseComparision.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseComparision.cxx rename to parsers/src/ParseComparision.cxx diff --git a/src/SOFIE_parsers/src/ParseConcat.cxx b/parsers/src/ParseConcat.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseConcat.cxx rename to parsers/src/ParseConcat.cxx diff --git a/src/SOFIE_parsers/src/ParseConstant.cxx b/parsers/src/ParseConstant.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseConstant.cxx rename to parsers/src/ParseConstant.cxx diff --git a/src/SOFIE_parsers/src/ParseConv.cxx b/parsers/src/ParseConv.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseConv.cxx rename to parsers/src/ParseConv.cxx diff --git a/src/SOFIE_parsers/src/ParseConvTranspose.cxx b/parsers/src/ParseConvTranspose.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseConvTranspose.cxx rename to parsers/src/ParseConvTranspose.cxx diff --git a/src/SOFIE_parsers/src/ParseEinsum.cxx b/parsers/src/ParseEinsum.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseEinsum.cxx rename to parsers/src/ParseEinsum.cxx diff --git a/src/SOFIE_parsers/src/ParseElu.cxx b/parsers/src/ParseElu.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseElu.cxx rename to parsers/src/ParseElu.cxx diff --git 
a/src/SOFIE_parsers/src/ParseErf.cxx b/parsers/src/ParseErf.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseErf.cxx rename to parsers/src/ParseErf.cxx diff --git a/src/SOFIE_parsers/src/ParseExpand.cxx b/parsers/src/ParseExpand.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseExpand.cxx rename to parsers/src/ParseExpand.cxx diff --git a/src/SOFIE_parsers/src/ParseEyeLike.cxx b/parsers/src/ParseEyeLike.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseEyeLike.cxx rename to parsers/src/ParseEyeLike.cxx diff --git a/src/SOFIE_parsers/src/ParseFuseBatchnormRelu.cxx b/parsers/src/ParseFuseBatchnormRelu.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseFuseBatchnormRelu.cxx rename to parsers/src/ParseFuseBatchnormRelu.cxx diff --git a/src/SOFIE_parsers/src/ParseFuseConvAdd.cxx b/parsers/src/ParseFuseConvAdd.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseFuseConvAdd.cxx rename to parsers/src/ParseFuseConvAdd.cxx diff --git a/src/SOFIE_parsers/src/ParseFuseConvTransposeAdd.cxx b/parsers/src/ParseFuseConvTransposeAdd.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseFuseConvTransposeAdd.cxx rename to parsers/src/ParseFuseConvTransposeAdd.cxx diff --git a/src/SOFIE_parsers/src/ParseFuseGemmRelu.cxx b/parsers/src/ParseFuseGemmRelu.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseFuseGemmRelu.cxx rename to parsers/src/ParseFuseGemmRelu.cxx diff --git a/src/SOFIE_parsers/src/ParseFuseMatMulAdd.cxx b/parsers/src/ParseFuseMatMulAdd.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseFuseMatMulAdd.cxx rename to parsers/src/ParseFuseMatMulAdd.cxx diff --git a/src/SOFIE_parsers/src/ParseGRU.cxx b/parsers/src/ParseGRU.cxx similarity index 97% rename from src/SOFIE_parsers/src/ParseGRU.cxx rename to parsers/src/ParseGRU.cxx index ec2cddf..58ce983 100644 --- a/src/SOFIE_parsers/src/ParseGRU.cxx +++ b/parsers/src/ParseGRU.cxx @@ -46,7 +46,7 @@ ParserFuncSignature 
ParseGRU = [](RModelParser_ONNX &parser, const onnx::NodePro } else if (attribute_name == "linear_before_reset") { attr_linear_before_reset = nodeproto.attribute(i).i(); } else { - std::cout << "TMVA SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode " + std::cout << "SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode " << nodeproto.name() << " is not defined in ONNX IR and not applied!\n"; } } diff --git a/src/SOFIE_parsers/src/ParseGather.cxx b/parsers/src/ParseGather.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseGather.cxx rename to parsers/src/ParseGather.cxx diff --git a/parsers/src/ParseGatherND.cxx b/parsers/src/ParseGatherND.cxx new file mode 100644 index 0000000..57beb01 --- /dev/null +++ b/parsers/src/ParseGatherND.cxx @@ -0,0 +1,49 @@ +#include "SOFIE/RModelParser_ONNX.hxx" +#include "SOFIE/ROperator_GatherND.hxx" +#include "onnx_proto3.pb.h" +#include + + +namespace SOFIE { + +ParserFuncSignature ParseGatherND = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + ETensorType input_type = ETensorType::UNDEFINED; + auto input_name = nodeproto.input(0); + if (parser.IsRegisteredTensorType(input_name)) { + input_type = parser.GetTensorType(input_name); + } else { + throw std::runtime_error("TMVA::SOFIE ONNX Parser GatherND op has input tensor " + input_name + + " but its type is not yet registered"); + } + + auto indices_name = nodeproto.input(1); + if (parser.IsRegisteredTensorType(indices_name)) { + ETensorType indices_type = parser.GetTensorType(indices_name); + if (indices_type != ETensorType::INT64) { + throw std::runtime_error("TMVA::SOFIE ONNX Parser GatherND op indices tensor must be INT64, got " + + indices_name); + } + } + + int64_t batch_dims = 0; + for (int i = 0; i < nodeproto.attribute_size(); ++i) { + const auto& attr = nodeproto.attribute(i); + if (attr.name() == "batch_dims") { + batch_dims = attr.i(); + break; + } + } + + std::string 
output_name = nodeproto.output(0); + + std::unique_ptr op( + new ROperator_GatherND(batch_dims, input_name, indices_name, output_name)); + + if (!parser.IsRegisteredTensorType(output_name)) { + parser.RegisterTensorType(output_name, input_type); + } + + return op; +}; + +} // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParseGemm.cxx b/parsers/src/ParseGemm.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseGemm.cxx rename to parsers/src/ParseGemm.cxx diff --git a/src/SOFIE_parsers/src/ParseIdentity.cxx b/parsers/src/ParseIdentity.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseIdentity.cxx rename to parsers/src/ParseIdentity.cxx diff --git a/src/SOFIE_parsers/src/ParseIf.cxx b/parsers/src/ParseIf.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseIf.cxx rename to parsers/src/ParseIf.cxx diff --git a/src/SOFIE_parsers/src/ParseLSTM.cxx b/parsers/src/ParseLSTM.cxx similarity index 97% rename from src/SOFIE_parsers/src/ParseLSTM.cxx rename to parsers/src/ParseLSTM.cxx index b9dc165..a95ee01 100644 --- a/src/SOFIE_parsers/src/ParseLSTM.cxx +++ b/parsers/src/ParseLSTM.cxx @@ -46,7 +46,7 @@ ParserFuncSignature ParseLSTM = [](RModelParser_ONNX &parser, const onnx::NodePr } else if (attribute_name == "layout") { attr_layout = nodeproto.attribute(i).i(); } else { - std::cout << "TMVA SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode " + std::cout << "SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode " << nodeproto.name() << " is not defined in ONNX IR and not applied!\n"; } } diff --git a/src/SOFIE_parsers/src/ParseLayerNormalization.cxx b/parsers/src/ParseLayerNormalization.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseLayerNormalization.cxx rename to parsers/src/ParseLayerNormalization.cxx diff --git a/src/SOFIE_parsers/src/ParseLeakyRelu.cxx b/parsers/src/ParseLeakyRelu.cxx similarity index 100% rename from 
src/SOFIE_parsers/src/ParseLeakyRelu.cxx rename to parsers/src/ParseLeakyRelu.cxx diff --git a/src/SOFIE_parsers/src/ParseMatMul.cxx b/parsers/src/ParseMatMul.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseMatMul.cxx rename to parsers/src/ParseMatMul.cxx diff --git a/parsers/src/ParseNot.cxx b/parsers/src/ParseNot.cxx new file mode 100644 index 0000000..ca315eb --- /dev/null +++ b/parsers/src/ParseNot.cxx @@ -0,0 +1,38 @@ +#include "SOFIE/RModelParser_ONNX.hxx" +#include "SOFIE/ROperator_Not.hxx" +#include "onnx_proto3.pb.h" + +namespace SOFIE { + +ParserFuncSignature ParseNot = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) +{ + ETensorType input_type = ETensorType::UNDEFINED; + + if (nodeproto.input_size() != 1 || nodeproto.output_size() != 1) + std::runtime_error("TMVA::SOFIE ONNX Parser Not op has invalid input or output size "); + + std::string input_name = nodeproto.input(0); + + if (parser.IsRegisteredTensorType(input_name)) { + input_type = parser.GetTensorType(input_name); + if (input_type !=ETensorType::BOOL && input_type !=ETensorType::UINT8 ) + throw std::runtime_error("TMVA::SOFIE ONNX Parser Not op has invalid input type " + ConvertTypeToString(input_type)); + } else { + throw + std::runtime_error("TMVA::SOFIE ONNX Parser Not op has input tensor " + input_name + + " but its type is not yet registered"); + } + + std::string output_name = nodeproto.output(0); + std::unique_ptr op(new ROperator_Not(input_name, output_name)); + + // Infer the output type + if (!parser.IsRegisteredTensorType(output_name)) { + parser.RegisterTensorType(output_name, input_type); + } + + return op; +}; + + +} // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParsePad.cxx b/parsers/src/ParsePad.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParsePad.cxx rename to parsers/src/ParsePad.cxx diff --git a/src/SOFIE_parsers/src/ParsePool.cxx b/parsers/src/ParsePool.cxx similarity index 100% rename from 
src/SOFIE_parsers/src/ParsePool.cxx rename to parsers/src/ParsePool.cxx diff --git a/src/SOFIE_parsers/src/ParseRNN.cxx b/parsers/src/ParseRNN.cxx similarity index 96% rename from src/SOFIE_parsers/src/ParseRNN.cxx rename to parsers/src/ParseRNN.cxx index d75b577..2d20e15 100644 --- a/src/SOFIE_parsers/src/ParseRNN.cxx +++ b/parsers/src/ParseRNN.cxx @@ -43,7 +43,7 @@ ParserFuncSignature ParseRNN = [](RModelParser_ONNX &parser, const onnx::NodePro } else if (attribute_name == "layout") { attr_layout = nodeproto.attribute(i).i(); } else { - std::cout << "TMVA SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode " + std::cout << "SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode " << nodeproto.name() << " is not defined in ONNX IR and not applied!\n"; } } diff --git a/src/SOFIE_parsers/src/ParseRandom.cxx b/parsers/src/ParseRandom.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseRandom.cxx rename to parsers/src/ParseRandom.cxx diff --git a/src/SOFIE_parsers/src/ParseRange.cxx b/parsers/src/ParseRange.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseRange.cxx rename to parsers/src/ParseRange.cxx diff --git a/src/SOFIE_parsers/src/ParseReduce.cxx b/parsers/src/ParseReduce.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseReduce.cxx rename to parsers/src/ParseReduce.cxx diff --git a/src/SOFIE_parsers/src/ParseRelu.cxx b/parsers/src/ParseRelu.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseRelu.cxx rename to parsers/src/ParseRelu.cxx diff --git a/src/SOFIE_parsers/src/ParseReshape.cxx b/parsers/src/ParseReshape.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseReshape.cxx rename to parsers/src/ParseReshape.cxx diff --git a/src/SOFIE_parsers/src/ParseScatterElements.cxx b/parsers/src/ParseScatterElements.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseScatterElements.cxx rename to parsers/src/ParseScatterElements.cxx 
diff --git a/src/SOFIE_parsers/src/ParseSelu.cxx b/parsers/src/ParseSelu.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseSelu.cxx rename to parsers/src/ParseSelu.cxx diff --git a/src/SOFIE_parsers/src/ParseShape.cxx b/parsers/src/ParseShape.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseShape.cxx rename to parsers/src/ParseShape.cxx diff --git a/src/SOFIE_parsers/src/ParseSigmoid.cxx b/parsers/src/ParseSigmoid.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseSigmoid.cxx rename to parsers/src/ParseSigmoid.cxx diff --git a/src/SOFIE_parsers/src/ParseSlice.cxx b/parsers/src/ParseSlice.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseSlice.cxx rename to parsers/src/ParseSlice.cxx diff --git a/src/SOFIE_parsers/src/ParseSoftmax.cxx b/parsers/src/ParseSoftmax.cxx similarity index 91% rename from src/SOFIE_parsers/src/ParseSoftmax.cxx rename to parsers/src/ParseSoftmax.cxx index aea042e..19bd57a 100644 --- a/src/SOFIE_parsers/src/ParseSoftmax.cxx +++ b/parsers/src/ParseSoftmax.cxx @@ -24,7 +24,7 @@ ParserFuncSignature ParseSoftmax = [](RModelParser_ONNX &parser, const onnx::Nod attr_axis = nodeproto.attribute(0).i(); switch (input_type) { - case ETensorType::FLOAT: op.reset(new ROperator_Softmax(attr_axis, input_name, output_name)); break; + case ETensorType::FLOAT: op.reset(new ROperator_Softmax(attr_axis, input_name, output_name)); break; default: throw std::runtime_error("TMVA::SOFIE - Unsupported - Operator Softmax does not yet support input type " + std::to_string(static_cast(input_type))); diff --git a/src/SOFIE_parsers/src/ParseSplit.cxx b/parsers/src/ParseSplit.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseSplit.cxx rename to parsers/src/ParseSplit.cxx diff --git a/src/SOFIE_parsers/src/ParseTanh.cxx b/parsers/src/ParseTanh.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseTanh.cxx rename to parsers/src/ParseTanh.cxx diff --git a/src/SOFIE_parsers/src/ParseTile.cxx 
b/parsers/src/ParseTile.cxx similarity index 93% rename from src/SOFIE_parsers/src/ParseTile.cxx rename to parsers/src/ParseTile.cxx index 20dbfb6..8b8c47f 100644 --- a/src/SOFIE_parsers/src/ParseTile.cxx +++ b/parsers/src/ParseTile.cxx @@ -29,6 +29,7 @@ ParserFuncSignature ParseTile = [](RModelParser_ONNX &parser, const onnx::NodePr switch (input_type) { case ETensorType::FLOAT: op.reset(new ROperator_Tile(repeat_name, input_name, output_name)); break; + case ETensorType::INT64: op.reset(new ROperator_Tile(repeat_name, input_name, output_name)); break; default: throw std::runtime_error("TMVA::SOFIE - Unsupported - Operator Tile does not yet support input type " + std::to_string(static_cast(input_type))); diff --git a/src/SOFIE_parsers/src/ParseTopK.cxx b/parsers/src/ParseTopK.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseTopK.cxx rename to parsers/src/ParseTopK.cxx diff --git a/src/SOFIE_parsers/src/ParseTranspose.cxx b/parsers/src/ParseTranspose.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseTranspose.cxx rename to parsers/src/ParseTranspose.cxx diff --git a/src/SOFIE_parsers/src/ParseWhere.cxx b/parsers/src/ParseWhere.cxx similarity index 80% rename from src/SOFIE_parsers/src/ParseWhere.cxx rename to parsers/src/ParseWhere.cxx index ea73cff..636c7e2 100644 --- a/src/SOFIE_parsers/src/ParseWhere.cxx +++ b/parsers/src/ParseWhere.cxx @@ -11,6 +11,10 @@ ParserFuncSignature ParseWhere = [](RModelParser_ONNX &parser, const onnx::NodeP throw std::runtime_error("TMVA::SOFIE ONNX Parser Where op has invalid input size"); } // condition boolean vector is input 0 + if (!parser.IsRegisteredTensorType(nodeproto.input(0))){ + throw std::runtime_error("TMVA::SOFIE ONNX Parser Where op has input tensor " + nodeproto.input(0) + + " but its type is not yet registered"); + } if (!parser.IsRegisteredTensorType(nodeproto.input(1))){ throw std::runtime_error("TMVA::SOFIE ONNX Parser Where op has input tensor " + nodeproto.input(1) + " but its type 
is not yet registered"); @@ -31,10 +35,10 @@ ParserFuncSignature ParseWhere = [](RModelParser_ONNX &parser, const onnx::NodeP switch (input_type) { case ETensorType::FLOAT: - op.reset(new ROperator_Where(nodeproto.input(1), nodeproto.input(2), nodeproto.input(0), output_name)); + op.reset(new ROperator_Where(nodeproto.input(0), nodeproto.input(1), nodeproto.input(2), output_name)); break; case ETensorType::INT64: - op.reset(new ROperator_Where(nodeproto.input(1), nodeproto.input(2), nodeproto.input(0), output_name)); + op.reset(new ROperator_Where(nodeproto.input(0), nodeproto.input(1), nodeproto.input(2), output_name)); break; default: throw std::runtime_error("TMVA::SOFIE - Unsupported - Where Operator does not yet support input type " + diff --git a/src/SOFIE_parsers/src/RModelParser_ONNX.cxx b/parsers/src/RModelParser_ONNX.cxx similarity index 93% rename from src/SOFIE_parsers/src/RModelParser_ONNX.cxx rename to parsers/src/RModelParser_ONNX.cxx index 68662ae..ddc7104 100644 --- a/src/SOFIE_parsers/src/RModelParser_ONNX.cxx +++ b/parsers/src/RModelParser_ONNX.cxx @@ -1,4 +1,3 @@ -#include "Byteswap.h" #include "SOFIE/RModelParser_ONNX.hxx" #include "onnx_proto3.pb.h" @@ -9,6 +8,10 @@ #include #include #include +#include +#include +#include +#include #include "SOFIE/SOFIE_common.hxx" @@ -24,6 +27,10 @@ extern ParserFuncSignature ParseLog; extern ParserFuncSignature ParseSin; extern ParserFuncSignature ParseCos; extern ParserFuncSignature ParseAbs; +extern ParserFuncSignature ParseSoftplus; +extern ParserFuncSignature ParseAtan; +extern ParserFuncSignature ParseFloor; + // Binary operators extern ParserFuncSignature ParseAdd; extern ParserFuncSignature ParseSub; @@ -41,6 +48,11 @@ extern ParserFuncSignature ParseLess; extern ParserFuncSignature ParseLessEq; extern ParserFuncSignature ParseGreater; extern ParserFuncSignature ParseGreaterEq; +//Is Operators +extern ParserFuncSignature ParseIsInf; +extern ParserFuncSignature ParseIsNaN; +extern ParserFuncSignature 
ParseNot; +extern ParserFuncSignature ParseClip; // Reduce operators extern ParserFuncSignature ParseReduceMean; extern ParserFuncSignature ParseReduceSum; @@ -73,6 +85,7 @@ extern ParserFuncSignature ParseShape; extern ParserFuncSignature ParseMatMul; extern ParserFuncSignature ParseLayerNormalization; extern ParserFuncSignature ParseGather; +extern ParserFuncSignature ParseGatherND; extern ParserFuncSignature ParseErf; extern ParserFuncSignature ParseElu; extern ParserFuncSignature ParseEyeLike; @@ -132,18 +145,31 @@ struct ExtractDataFromTP { static_cast(data)); } }; +// Reverse the bytes of a trivially-copyable value (used on big-endian hosts). +// ONNX raw_data is always stored in little-endian order. +template +static T bswap_value(T value) noexcept { + static_assert(std::is_trivially_copyable_v); + std::array bytes; + std::memcpy(bytes.data(), &value, sizeof(T)); + std::reverse(bytes.begin(), bytes.end()); + T result; + std::memcpy(&result, bytes.data(), sizeof(T)); + return result; +} + template std::shared_ptr GetInitializedTensorData(onnx::TensorProto * tensorproto, size_t length) { + std::cout<<"Getting Initialized Tensor data for tensor " << tensorproto->name() << " of type " << tensorproto->data_type() << " and length " << length << std::endl; std::shared_ptr data(malloc(length * sizeof(T)), free); if (!tensorproto->raw_data().empty()) { -#ifdef R__BYTESWAP std::memcpy(data.get(), tensorproto->raw_data().c_str(), length * sizeof(T)); -#else - for (std::size_t k = 0; k < length; ++k) - (reinterpret_cast::value_type *>(data.get()))[k] = - RByteSwap::bswap((reinterpret_cast::value_type *>(tensorproto->raw_data().c_str()))[k]); -#endif + if constexpr (std::endian::native != std::endian::little) { + T *ptr = static_cast(data.get()); + for (std::size_t k = 0; k < length; ++k) + ptr[k] = bswap_value(ptr[k]); + } } else { ExtractDataFromTP::Copy(tensorproto, data.get()); } @@ -162,6 +188,10 @@ RModelParser_ONNX::RModelParser_ONNX() noexcept : 
fOperatorsMapImpl(std::make_un RegisterOperator("Sin", ParseSin); RegisterOperator("Cos", ParseCos); RegisterOperator("Abs", ParseAbs); + RegisterOperator("Softplus", ParseSoftplus); + RegisterOperator("Atan", ParseAtan); + RegisterOperator("Floor", ParseFloor); + // Binary operators RegisterOperator("Add", ParseAdd); RegisterOperator("Sub", ParseSub); @@ -179,6 +209,11 @@ RModelParser_ONNX::RModelParser_ONNX() noexcept : fOperatorsMapImpl(std::make_un RegisterOperator("LessOrEqual", ParseLessEq); RegisterOperator("Greater", ParseGreater); RegisterOperator("GreaterOrEqual", ParseGreaterEq); + // Is / Not operators + RegisterOperator("IsInf", ParseIsInf); + RegisterOperator("IsNaN", ParseIsNaN); + RegisterOperator("Not", ParseNot); + RegisterOperator("Clip", ParseClip); // Reduce operators RegisterOperator("ReduceMean", ParseReduceMean); RegisterOperator("ReduceSum", ParseReduceSum); @@ -217,6 +252,7 @@ RModelParser_ONNX::RModelParser_ONNX() noexcept : fOperatorsMapImpl(std::make_un RegisterOperator("LayerNormalization", ParseLayerNormalization); RegisterOperator("Expand", ParseExpand); RegisterOperator("Gather", ParseGather); + RegisterOperator("GatherND", ParseGatherND); RegisterOperator("Erf", ParseErf); RegisterOperator("Elu", ParseElu); RegisterOperator("EyeLike", ParseEyeLike); @@ -584,6 +620,13 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & if (verbose) std::cout << "add INT64 initialized tensor " << input_name << " shape " << ConvertShapeToString(shape) << std::endl; rmodel.AddInitializedTensor(input_name, ETensorType::INT64, shape, data); allInitializedTensors[input_name] = i; + std::cout<<"Printing initialized values for tensor: "<(data.get()); + + for (size_t i = 0; i < fLength; ++i) { + std::cout << rawData[i] << " "; + } + std::cout << std::endl; break; } default: @@ -730,7 +773,7 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & std::cout << "\t" << i << " " << nodesOrder[i] << " 
parsing operator " << op_type << std::endl; } - std::unique_ptr op = ParseOperator(i, graph, nodesOrder, nodesChildren[i]); + std::unique_ptr op = ParseOperator(i, graph, nodesOrder, nodesChildren[nodesOrder[i]]); if (!op) { if (verbose) { std::cout << "\t\tskipping operator since it is fused with previous one" << std::endl; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt deleted file mode 100644 index c48e8d1..0000000 --- a/src/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (C) 1995-2019, Rene Brun and Fons Rademakers. -# All rights reserved. -# -# For the licensing terms see $ROOTSYS/LICENSE. -# For the list of contributors see $ROOTSYS/README/CREDITS. - -set(sofie_legacy_eval_backend ON CACHE BOOL "" FORCE) - -add_subdirectory(SOFIE_core) -add_subdirectory(SOFIE_parsers) diff --git a/src/SOFIE_core/inc/SOFIE/ROperator.hxx b/src/SOFIE_core/inc/SOFIE/ROperator.hxx deleted file mode 100644 index edbec58..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator.hxx +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef SOFIE_ROPERATOR -#define SOFIE_ROPERATOR - -#include -#include - -#include "SOFIE/SOFIE_common.hxx" -//#include "RModel.hxx" - - - - -namespace SOFIE{ - -class RModel; - -class ROperator{ - - -public: - virtual std::vector GetBlasRoutines() { return {}; } - virtual std::vector GetStdLibs() { return {}; } - virtual std::vector> ShapeInference(std::vector>) = 0; - virtual std::vector TypeInference(std::vector) = 0; - virtual void Initialize(RModel&) = 0; - virtual std::string Generate(std::string OpName) = 0; //expect unique opName for each operator within the same RModel - // generate initialization code for session constructor - virtual std::string GenerateInitCode() { return "";} - // generate some specific declaration code for Session - virtual std::string GenerateDeclCode() { return "";} - // generate session data members specific to operator - virtual std::string GenerateSessionMembersCode(std::string /*opName*/) { return ""; } - virtual std::string 
Header() { return "";} - - //virtual void Forward_reference() = 0; - //virtual void Forward_blas() = 0; - virtual ~ROperator(){} - -protected: - - const std::string SP = " "; ///< space used to correctly indent the generated C++ code - bool fUseSession = false; ///< flag to identify if using the session class - bool fIsOutputConstant = false; ///< flag to identify if operator has a constant output (no need to generate code) - - mutable std::vector fInputTensorNames; - mutable std::vector fOutputTensorNames; - -public: - std::span GetOpInputTensors() const { - return fInputTensorNames; - } - - std::span GetOpOutputTensors() const { - return fOutputTensorNames; - } - -}; - - - -}//SOFIE - - -#endif //SOFIE_OPERATOR diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx deleted file mode 100644 index 127eaff..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx +++ /dev/null @@ -1,216 +0,0 @@ -#ifndef SOFIE_ROperator_BasicBinary -#define SOFIE_ROperator_BasicBinary - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - -namespace SOFIE{ - -enum EBasicBinaryOperator { Add, Sub, Mul, Div, Pow }; - -template -struct BinaryOperatorTrait {}; - -template -struct BinaryOperatorTrait { - static const std::string Name() { return "Add"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " + " + t2; } - static T Func(T t1, T t2) {return t1 + t2;} -}; - -template -struct BinaryOperatorTrait { - static const std::string Name() { return "Sub"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " - " + t2; } - static T Func (T t1, T t2) { return t1 - t2;} -}; - -template -struct BinaryOperatorTrait { - static const std::string Name() { return "Mul"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " * " + t2; } - static T Func (T t1, T t2) { return 
t1 * t2;} -}; - -template -struct BinaryOperatorTrait { - static const std::string Name() { return "Div"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " / " + t2; } - static T Func (T t1, T t2) { return t1/t2;} -}; - -template -struct BinaryOperatorTrait { - static const std::string Name() { return "Pow"; } - static std::string Op(const std::string & t1, const std::string t2) { return "std::pow(" + t1 + "," + t2 + ")"; } - static T Func (T t1, T t2) { return std::pow(t1,t2);} -}; - -template -class ROperator_BasicBinary final : public ROperator{ -private: - - std::string fNA; - std::string fNB; - std::string fNBroadcastedA; - std::string fNBroadcastedB; - std::string fNY; - - std::vector fShapeA; - std::vector fShapeB; - std::vector fShapeY; - -public: - ROperator_BasicBinary(){} - ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY): - fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNA, fNB }; - fOutputTensorNames = { fNY }; - } - - // type of output given input - std::vector TypeInference(std::vector input) override { - return input; - } - - // shape of output tensors given input tensors - std::vector> ShapeInference(std::vector> input) override { - // assume now inputs have same shape (no broadcasting) - auto ret = std::vector>(1, input[0]); // return vector size 1 with first input - return ret; - } - - void Initialize(RModel& model) override { - // input must be a graph input, or already initialized intermediate tensor - if (!model.CheckIfTensorAlreadyExist(fNA)){ - throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNA + "is not found in model"); - } - if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNB + "is not found in model"); - } - fShapeA = model.GetTensorShape(fNA); - fShapeB = model.GetTensorShape(fNB); - bool 
broadcast = !UTILITY::AreSameShape(fShapeA, fShapeB); - if (broadcast) { - // Y is the common shape of A and B - fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeA, fShapeB); - bool broadcastA = !UTILITY::AreSameShape(fShapeA, fShapeY); - bool broadcastB = !UTILITY::AreSameShape(fShapeB, fShapeY); - // Broadcast A to Y - if (broadcastA) { - fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY; - if (model.IsInitializedTensor(fNA)) { - auto data = model.GetInitializedTensorData(fNA); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), - std::default_delete()); - // Update the data and the shape of A - model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); - fShapeA = fShapeY; - } else { - // Add an intermediate tensor for broadcasting A - model.AddIntermediateTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY); - } - } - // Broadcast B to Y - if (broadcastB) { - fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY; - if (model.IsInitializedTensor(fNB)) { - auto data = model.GetInitializedTensorData(fNB); - std::cout << "data B " << ConvertShapeToString(fShapeB) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast(data.get())) << std::endl; - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), - std::default_delete()); - // do not update tensor B but add broadcasted one (since it can be input to some other operators) - std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeY), static_cast(broadcastedData.get())) << std::endl; - model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); - fShapeB = fShapeY; - } else { - // Add an intermediate tensor for broadcasting B - model.AddIntermediateTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY); - } - } - } else { - 
fShapeY = fShapeA; - } - // check case of constant output (if all inputs are defined) - if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB)) { - const std::string& nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - const std::string& nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); - auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); - std::vector dataY(ConvertShapeToLength(fShapeY)); - for (size_t i = 0; i < dataY.size(); i++) { - dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); - } - model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in a fil - model.SetNotWritableInitializedTensor(nameA); - model.SetNotWritableInitializedTensor(nameB); - fIsOutputConstant = true; - if (model.Verbose()) - std::cout << "Binary op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(dataY) << std::endl; - } - else { - model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); - } - } - - std::string GenerateInitCode() override { - std::stringstream out; - return out.str(); - } - - std::string Generate(std::string OpName) override { - - if (fIsOutputConstant) return ""; - - OpName = "op_" + OpName; - - if (fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Binary Op called to Generate without being initialized first"); - } - std::stringstream out; - out << SP << "\n//------ " << BinaryOperatorTrait::Name() << "\n"; - size_t length = ConvertShapeToLength(fShapeY); - std::string typeName = TensorType::Name(); - // Broadcast A if it's uninitialized - // use broadcasting function where we pass an already allocated tensor to minimize memory allocations - if (fShapeA != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNA << "\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNA << ", " << 
ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedA << ");\n"; - } - // Broadcast B if it's uninitialized - if (fShapeB != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNB << "\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedB << ");\n"; - } - const std::string& nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - const std::string& nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = " << BinaryOperatorTrait::Op( "tensor_" + nameA + "[id]" , "tensor_" + nameB + "[id]") << " ;\n"; - out << SP << "}\n"; - return out.str(); - } - - std::vector GetStdLibs() override { - if (Op == EBasicBinaryOperator::Pow) { - return { std::string("cmath") }; - } else { - return {}; - } - } -}; - -}//SOFIE - - -#endif //SOFIE_ROperator_BasicBinary diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx deleted file mode 100644 index 47c3d66..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef SOFIE_ROPERATOR_Cast -#define SOFIE_ROPERATOR_Cast - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - - -class ROperator_Cast final : public ROperator -{ - -private: - - std::string fNX; - std::string fNY; - std::vector fShape; - std::string fAttrType = "float"; - -public: - ROperator_Cast(){} - ROperator_Cast(std::string attr_type,std::string nameX, std::string nameY): - fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)), - fAttrType(attr_type) { - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector 
TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; //suggest copy to compiler - return ret; - } - - void Initialize(RModel& model) override { - //input must be a graph input, or already initialized intermediate tensor - if (model.CheckIfTensorAlreadyExist(fNX) == false){ - throw std::runtime_error("TMVA SOFIE Cast Op Input Tensor is not found in model"); - } - fShape = model.GetTensorShape(fNX); - // shoud we add a check if the same type - auto inputType = model.GetTensorType(fNX); - if (model.IsInitializedTensor(fNX)) { - fIsOutputConstant = true; - auto inputData = model.GetInitializedTensorData(fNX); - if (ConvertStringToType(fAttrType) == ETensorType::INT64) { - model.AddConstantTensor(fNY, fShape, static_cast(inputData.get())); - model.SetNotWritableInitializedTensor(fNX); - } - else - fIsOutputConstant = false; - } - if (!fIsOutputConstant) - model.AddIntermediateTensor(fNY, ConvertStringToType(fAttrType), fShape); - if (model.Verbose()) { - std::cout << "Cast : " << ConvertTypeToString(inputType) << " " << fNX << " -> " << fAttrType << " for " << fNY; - if (fIsOutputConstant) std::cout << " (constant) "; - std::cout << std::endl; - } - } - - - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; - - OpName = "op_" + OpName; - if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Cast called to Generate without being initialized first"); - } - std::stringstream out; - size_t length = ConvertShapeToLength(fShape); - - // out << SP << ETensorType << " " << OpName << "_attr = " << fattr << ";\n"; - out << "\n//------ CAST\n"; - // no generated code for constant outputs - if (fIsOutputConstant) return out.str(); - - out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; - - out << SP << SP << "tensor_" << fNY << "[id] = static_cast<"<< fAttrType << ">(tensor_" << fNX << "[id]);\n"; - - out << SP << "}\n"; - return 
out.str(); - } - -}; - -}//SOFIE - -#endif //SOFIE_ROPERATOR_Cast diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx deleted file mode 100644 index 0d5e574..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx +++ /dev/null @@ -1,263 +0,0 @@ -#ifndef SOFIE_ROPERATOR_Concat - #define SOFIE_ROPERATOR_Concat - - - #include "SOFIE/SOFIE_common.hxx" - #include "SOFIE/ROperator.hxx" - #include "SOFIE/RModel.hxx" - - #include - #include - #include - #include - #include - - namespace SOFIE{ - - class ROperator_Concat final : public ROperator - { - private: - int fAxis=0; - int fnewAxis=0; - std::vector fInputs; - std::string fOutput; - std::vectorfOutputShape; - std::vector> fInputShapes; - - public: - ROperator_Concat(){} - ROperator_Concat(std::vector inputs, int axis, int newAxis, std::string output): - fAxis(axis), fnewAxis(newAxis), fOutput(UTILITY::Clean_name(output)) { - fInputs.reserve(inputs.size()); - for (auto & name : inputs) - fInputs.push_back(UTILITY::Clean_name(name)); - - fInputTensorNames.resize(fInputs.size()); - std::transform(fInputs.begin(), fInputs.end(), fInputTensorNames.begin(), - [](const std::string& s) -> std::string_view { return s; }); - fOutputTensorNames = { fOutput }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - // get shape of output given inputs. 
It is going to be called after initialized - std::vector> ShapeInference(std::vector> inputs) override { - std::vector> ret(1); - // treat negative axis case - if (fAxis<0) { - fAxis = inputs[0].size()+fAxis; - } - if (fAxis < 0 || fAxis >= (int) inputs[0].size()) - throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value "); - - int concat_dim=0; - if(fnewAxis == 0){ - for (size_t i = 0; i < inputs.size(); i++) { - if (i > 0 && inputs[i].size() != inputs[i - 1].size()) - throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have different shapes " + - ConvertShapeToString(inputs[i]) + " and " + ConvertShapeToString(inputs[i - 1])); - for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { - if ((int)iaxis == fAxis) - concat_dim += inputs[i][iaxis]; - else if (i > 0 && inputs[i][iaxis] != inputs[i - 1][iaxis]) - throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have wrong shapes " + - ConvertShapeToString(inputs[i]) + " and " + - ConvertShapeToString(inputs[i - 1])); - } - } - - // output shape - ret[0] = inputs[0]; - ret[0][fAxis] = concat_dim; - } - std::vector stack; - if(fnewAxis == 1){ - for(size_t i = 0; i < inputs.size(); i++) { - if (i > 0 && inputs[i].size() != inputs[i-1].size() ) - throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " + - ConvertShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertShapeToString(inputs[i-1])); - for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { - if ((int) iaxis == fAxis) - stack.push_back(inputs[i][iaxis]); - else - if (i> 0 && inputs[i][iaxis] != inputs[i-1][iaxis]) - throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have wrong shapes " + - ConvertShapeToString(inputs[i]) + " and " + ConvertShapeToString(inputs[i-1])); - } - - } - for(auto it:stack) - ret[0].push_back(it); - } - - return ret; - } - - // get shape of output given inputs. 
It is going to be called after initialized - std::vector> ShapeInference(const std::vector> & inputs) { - std::vector> ret(1); - // treat negative axis case - if (fAxis<0) { - fAxis = inputs[0].size()+fAxis; - } - if (fAxis < 0 || fAxis >= (int) inputs[0].size()) - throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value "); - - int concat_dim=0; - if(fnewAxis == 0){ - for (size_t i = 0; i < inputs.size(); i++) { - if (i > 0 && inputs[i].size() != inputs[i - 1].size()) - throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " + - ConvertDynamicShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertDynamicShapeToString(inputs[i - 1])); - for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { - if ((int)iaxis == fAxis) { - // support only non-params shape for the concatenation axis - if (inputs[i][iaxis].isParam) - throw std::runtime_error("TMVA SOFIE Concat Op - not supporting input param dimensions for concatenation axis. Input shape is " + - ConvertDynamicShapeToString(inputs[i])); - concat_dim += inputs[i][iaxis].dim; - } - // other dimensions must be the same - else if (i > 0 && inputs[i][iaxis].GetVal() != inputs[i - 1][iaxis].GetVal()) - throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have wrong shapes " + - ConvertDynamicShapeToString(inputs[i]) + " and " + - ConvertDynamicShapeToString(inputs[i - 1])); - } - } - - // output shape - ret[0] = inputs[0]; - ret[0][fAxis].dim = concat_dim; - } - // case of stacking (not supported yet) - // here we need to check that input shapes are the same - // for example for fAxis == 0 - // output shapes: [inputs.size(), inputs[0][0], inputs[0][1],....] - if(fnewAxis == 1){ - throw std::runtime_error("TMVA SOFIE Concat Op - stacking (i.e. 
COncatFromSequence with new_axis=1) is not supported "); - } - return ret; - } - - void Initialize(RModel& model) override { - for (auto &it : fInputs) { - if (model.CheckIfTensorAlreadyExist(it) == false) { - throw std::runtime_error("TMVA SOFIE Concat Op Input Tensor " + it + " is not found in model"); - } - fInputShapes.push_back(model.GetDynamicTensorShape(it)); - } - fOutputShape = ShapeInference(fInputShapes)[0]; - if (model.Verbose()) - std::cout << "Output of concat operator has shape " << ConvertDynamicShapeToString(fOutputShape) << std::endl; - - // check if concat has constant inputs , axis 0(concat contigous memory and type is integer) - if (model.GetTensorType(fInputs[0]) == ETensorType::INT64 && fAxis == 0) { - fIsOutputConstant = true; - for ( auto & input : fInputs) { - if (!model.IsInitializedTensor(input)) { - fIsOutputConstant = false; - break; - } - } - if (fIsOutputConstant) { - auto outputShape = ConvertShapeToInt(fOutputShape); // conversion must be possible - std::vector outputData(ConvertShapeToLength(outputShape)); - size_t offset = 0; - for ( auto & input : fInputs) { - auto inputData = static_cast(model.GetInitializedTensorData(input).get()); - auto inputShape = model.GetTensorShape(input); // shape is not dynamic if it is constant - size_t inputLength = ConvertShapeToLength(inputShape); - std::copy(inputData, inputData + inputLength, outputData.begin() + offset ); - offset += inputLength; - // data do not need to be written as a weight - model.SetNotWritableInitializedTensor(input); - } - model.AddConstantTensor(fOutput, outputShape, outputData.data()); - if (model.Verbose()) { - std::cout << "output of Concat is a constant tensor " << ConvertShapeToString(outputShape) << " : " - << ConvertValuesToString(outputData) << std::endl; - } - } - } - if (!fIsOutputConstant) { - model.AddIntermediateTensor(fOutput, model.GetTensorType(fInputs[0]), fOutputShape); - if (model.Verbose()) { - std::cout << "Concat ---> " << fOutput << " " << 
ConvertDynamicShapeToString(fOutputShape) << std::endl; - } - } - } - - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; - OpName = "op_"+OpName; - if(fOutputShape.empty()){ - throw std::runtime_error("TMVA SOFIE Concat called to Generate without being initialized first"); - } - std::stringstream out; - out<<"\n//--------- Concat\n"; - // special case when memory is contiguous - bool hasShapeOnes = true; - for(int i = 0; i 0) out << offset; - offset += " + " + length; - out << ");\n"; - } - } - else { - - std::vector outStride = UTILITY::ComputeStrideFromShape(fOutputShape); - std::vector> inStrides(fInputs.size()); - int idx = 0; - for ( auto &s : inStrides) { - s = UTILITY::ComputeStrideFromShape(fInputShapes[idx]); - idx++; - } - for (int i = 0; i < fAxis; ++i) { - // loop on dimensions - out << SP << "for (size_t i" << i << " = 0; i" << i << " < " << fOutputShape[i].GetVal() << "; ++i" << i <<") {\n"; - } - - out << SP << SP << SP << "int idxOut = "; - for (int k = 0; k < fAxis; k++) { - if (k > 0) out << " + "; - out << outStride[k].GetVal() << "*i" << k; - } - out << ";\n"; - - for (size_t j = 0; j < fInputs.size(); j++) { - if (j>0) - out << SP << SP << SP << "idxOut += " << fInputShapes[j-1][fAxis].GetVal() << ";\n"; - out << SP << SP << SP << "int idxIn" << j <<" = "; - for (int k = 0; k < fAxis; k++) { - if (k > 0) out << " + "; - out << inStrides[j][k].GetVal() << "*i" << k; - } - out << ";\n"; - out << SP << SP << SP << "for (size_t iC = 0; iC < " << fInputShapes[j][fAxis].GetVal() << "; ++iC) {\n"; - out << SP << SP << SP << SP << "tensor_" << fOutput << "[idxOut+iC] = tensor_" << fInputs[j] << "[idxIn" << j << "+iC];\n"; - out << SP << SP << SP << "}\n"; - // concatenate the axis values - } - for (int i = 0; i < fAxis; ++i) { - out << SP << "}\n"; - } - } - - return out.str(); - } - }; - }//SOFIE - - #endif //SOFIE_ROPERATOR_CONCAT diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Conv.hxx 
b/src/SOFIE_core/inc/SOFIE/ROperator_Conv.hxx deleted file mode 100644 index 15ca91e..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Conv.hxx +++ /dev/null @@ -1,531 +0,0 @@ -#ifndef SOFIE_SOFIE_ROPERATOR_CONV -#define SOFIE_SOFIE_ROPERATOR_CONV - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include -#include -#include -#include -#include -#include - - -namespace SOFIE { - -template -class ROperator_Conv final : public ROperator -{ -private: - std::string fAttrAutopad; - std::vector fAttrDilations; - size_t fAttrGroup; - std::vector fAttrKernelShape; - std::vector fAttrPads; - std::vector fAttrStrides; - - std::string fNX; - std::string fNW; - std::string fNB; - std::string fNB2; // bias tensor name after broadcasting - std::string fNY; - - std::string convK; - std::string imcol; - - std::vector fShapeX; - std::vector fShapeW; - std::vector fShapeB; - std::vector fShapeY; - - std::string fType; - - size_t fDim; // dimension of the convolution - - -public: - - ROperator_Conv() {} - - ROperator_Conv(std::string autopad, std::vector dilations, - size_t group, std::vector kernelShape, std::vector pads, - std::vector strides, std::string nameX, std::string nameW, - std::string nameB, std::string nameY): - fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape), - fAttrPads(pads), fAttrStrides(strides), - fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)), - fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) - { - if(std::is_same::value) { - fType = "float"; - } else { - throw - std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator"); - } - fInputTensorNames = { fNX, fNB }; - fOutputTensorNames = { fNY }; - } - - ROperator_Conv(std::string autopad, std::vector dilations, - size_t group, std::vector kernelShape, std::vector pads, - std::vector strides, std::string nameX, std::string nameW, - std::string nameY): - 
fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape), - fAttrPads(pads), fAttrStrides(strides), - fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)), fNY(UTILITY::Clean_name(nameY)) - { - if(std::is_same::value) { - fType = "float"; - } else { - throw - std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator"); - } - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - ETensorType out = input[0]; - return {out}; - } - - // function returning output shape given input - std::vector> ShapeInference(std::vector> input) override { - // shape of convolution input has to be (according to ONNX): N x C x H x W - // Where N : batch size, C : input channels, H : input height, W : input width - - if (input.size() > 3 ) { - throw - std::runtime_error("TMVA SOFIE Conv Op Shape inference need 2 or 3 input tensors"); - } - for(size_t i = 0; i < input.size(); i++) { - if (input[i].size() -2 != fDim) { - throw - std::runtime_error("TMVA SOFIE Conv Op Shape inference - invalid inputs "); - } - } - - if (fAttrGroup == 0) { - fAttrGroup = input[0][1] / input[1][1]; - } - - // kernel shape - size_t k1 = ((fAttrKernelShape.empty())? input[1][2] : fAttrKernelShape[0]); - size_t k2 = (fDim > 1) ? ((fAttrKernelShape.empty()) ? input[1][3] : fAttrKernelShape[1]) : 1; - size_t k3 = (fDim > 2) ? ((fAttrKernelShape.empty()) ? input[1][4] : fAttrKernelShape[2]) : 1; - - - size_t i1 = (fDim > 1) ? ((fDim > 2) ? 3 : 2) : 1; - size_t i2 = (fDim > 2) ? 
4 : 3; - size_t i3 = 5; - - if (fAttrDilations.empty()) { - fAttrDilations = {1, 1, 1}; - } - fAttrDilations.resize(3); - if (fDim < 3) { - fAttrDilations.resize(3, 1); - } - // Shape of the kernel - fAttrKernelShape = {k1 + (fAttrDilations[0] - 1) * (k1 - 1), - k2 + (fAttrDilations[1] - 1) * (k2 - 1), - k3 + (fAttrDilations[2] - 1) * (k3 - 1)}; - - if (fAttrAutopad == "NOTSET") { - if (fAttrPads.empty()) { - fAttrPads = {1, 1, 1, 1, 1, 1}; - } - } else if (fAttrAutopad == "SAME_UPPER" || fAttrAutopad == "SAME_LOWER") { - if (fDim == 1) - fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[0] / 2}; - else if (fDim == 2) - fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2}; - else if (fDim == 3) - fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[2] / 2, - fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[2] / 2}; - // add extra padding at beginning or end (depending if SAME_UPPER or SAME_LOWER) - // need to check this! - if (fAttrKernelShape[0] % 2 == 1) { - (fAttrAutopad == "SAME_UPPER") ? fAttrPads[0]++ : fAttrPads[i1]++; - } - if (fDim > 1 && fAttrKernelShape[1] % 2 == 1) { - (fAttrAutopad == "SAME_UPPER") ? fAttrPads[1]++ : fAttrPads[i2]++; - } - if (fDim > 2 && fAttrKernelShape[2] % 2 == 1) { - (fAttrAutopad == "SAME_UPPER") ? fAttrPads[2]++ : fAttrPads[i3]++; - } - } else if (fAttrAutopad != "VALID") { - throw - std::runtime_error("TMVA SOFIE Conv Op invalid fAutopad"); - } - // to be sure pad is vector of size 6 - if (fDim < 3) fAttrPads.resize(6, 0); - - if (fAttrStrides.empty()) { - fAttrStrides = {1, 1, 1}; - } - if (fDim < 3) - fAttrStrides.resize(3, 1); - - - size_t input1 = input[0][2]; - size_t input2 = (fDim > 1) ? input[0][3] : 1; - size_t input3 = (fDim > 2) ? 
input[0][4] : 1; - - size_t pad1 = fAttrPads[0] + fAttrPads[i1]; - size_t output1 = (input1 + pad1 - fAttrKernelShape[0]) / fAttrStrides[0] + 1; - - size_t batch_size = input[0][0]; // first element in input tensor - size_t output_channels = input[1][0]; // first element in weight tensor - - std::vector> ret({{ batch_size, output_channels, output1 }}); - - if (fDim == 1) - return ret; - - size_t pad2 = fAttrPads[1] + fAttrPads[i2]; - size_t output2 = (input2 + pad2 - fAttrKernelShape[1]) / fAttrStrides[1] + 1; - // output is N x M x OH x OW - ret[0].push_back(output2); - if (fDim == 2) - return ret; - - size_t pad3 = fAttrPads[2] + fAttrPads[i3]; - size_t output3 = (input3 + pad3 - fAttrKernelShape[2] ) / fAttrStrides[2] + 1; - - // output is N x M x OH x OW x OD - ret[0].push_back(output3); - return ret; - } - - void Initialize(RModel& model) override { - fUseSession = model.UseSession(); - if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw - std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNX + " is not found in model"); - } - fShapeX = model.GetTensorShape(fNX); - if (fShapeX.size() < 3 || fShapeX.size() > 5) { - std::cout << fNX << " : " << ConvertShapeToString(fShapeX) << std::endl; - throw - std::runtime_error("TMVA SOFIE Conv Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions"); - } - fDim = fShapeX.size() - 2; - if (!model.CheckIfTensorAlreadyExist(fNW)) { - throw - std::runtime_error("TMVA SOFIE Conv op Input weight Tensor " + fNW + " is not found in model"); - } - fShapeW = model.GetTensorShape(fNW); - if (fShapeW.size() < 3 || fShapeW.size() > 5) { - std::cout << fNW << " : " << ConvertShapeToString(fShapeW) << std::endl; - throw std::runtime_error("TMVA SOFIE Conv Op input weight tensor" + fNW + " is not of 3,4 or 5 dimensions"); - } - fShapeY = ShapeInference({fShapeX, fShapeW})[0]; - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - if (fNB != "") { - if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw - 
std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNB + " is not found in model"); - } - fShapeB = model.GetTensorShape(fNB); - std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); - bool broadcast_needed = !UTILITY::AreSameShape(fShapeB, targetShape); - if (broadcast_needed) { - auto original_data = model.GetInitializedTensorData(fNB); - // make bias shape equal to Y shape by adding 1 - if (fShapeB.size() < 1) - throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has empty shape"); - // we assume bias tensor dimension is equal to number of filters that is the second dimension in - // the output tensor - if (fShapeB[0] != fShapeY[1]) - throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has wrong shape: " + - ConvertShapeToString(fShapeB)); - if (fType != "float") - throw std::runtime_error("TMVA SOFIE Conv op: Broadcasting for non-float type tensors is not supported"); - // here is the actual broadcasting - if (!fUseSession) { - std::vector shape(fDim + 1, 1); - shape[0] = fShapeB[0]; - std::shared_ptr new_data_ptr( - UTILITY::UnidirectionalBroadcast(static_cast(original_data.get()), shape, targetShape), - std::default_delete()); - model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), targetShape, new_data_ptr); - fShapeB = model.GetTensorShape(fNB); - fNB2 = fNB; // use same name - } - else { - // In case of session add broadcasting code in Session constructor and in GenerateInitCode - // we need to add a new intermediate tensor for broadcasted bias tensor - fNB2 = fNB + "bcast"; - model.AddIntermediateTensor(fNB2, model.GetTensorType(fNB), targetShape); - } - } - } - - size_t outputChannelSize = fShapeY[2]; // size/channel = D * H * W - size_t kernelSize = fAttrKernelShape[0]; - for (size_t i = 1; i < fDim; i++) { - outputChannelSize *= fShapeY[2 + i]; - kernelSize *= fAttrKernelShape[i]; - } - - std::vector shape1 = {fShapeW[0], fShapeW[1], kernelSize}; - std::vector shape2 = {fShapeW[1], kernelSize, outputChannelSize}; - 
model.AddIntermediateTensor(fNX +"_f", ConvertStringToType(fType), shape1 ); - model.AddIntermediateTensor(fNX +"_xcol", ConvertStringToType(fType), shape2 ); - convK = fNX +"_f"; - imcol = fNX +"_xcol"; - fOutputTensorNames.emplace_back(convK); - fOutputTensorNames.emplace_back(imcol); - } - - std::string GenerateInitCode() override { - std::stringstream out; - // Generate initialization code for broadcasting of bias tensor - if (!fNB2.empty()) { - // include a separate scope to avoid defining unique operator temp variables - std::vector shape(fDim + 1, 1); - shape[0] = fShapeB[0]; - std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); - out << SP << "{\n"; - out << SP << SP << "float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" - << fNB << ", " << ConvertShapeToString(shape) << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << "std::copy(data, data + " << ConvertShapeToLength(targetShape) << ", tensor_" << fNB2 << ");\n"; - out << SP << SP << "delete[] data;\n"; - out << SP << "}\n"; - } - return out.str(); - } - - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; - - if (fShapeX.empty() || fShapeW.empty() || (fNB != "" && fShapeB.empty()) || fShapeY.empty()) { - throw - std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first"); - } - - std::stringstream out; - size_t bsize = fShapeX[0]; - size_t kDepth = (fDim > 2) ? fShapeW[2] : 1; // kernel depth - size_t kHeight = (fDim > 1) ? fShapeW[fDim] : 1; // kernel height - size_t kWidth = fShapeW[fDim+1]; // kernel width - size_t iDepth = (fDim > 2) ? fShapeX[2] : 1; // input depth - size_t iHeight = (fDim > 1) ? fShapeX[fDim] : 1; // input height - size_t iWidth = fShapeX[fDim+1]; // input width - size_t oDepth = (fDim > 2) ? fShapeY[2] : 1; // output depth - size_t oHeight = (fDim > 1) ? 
fShapeY[fDim] : 1; // ouput height - size_t oWidth = fShapeY[fDim+1]; // output width - - out << "\n//---- operator Conv " << OpName << "\n"; - - // vectorize the (dilated)convolution kernels into a matrix - // no need to transpose the matrix - // to fix for 1d and 3d - - size_t id = (fDim > 2) ? fDim-3 : 2; - size_t ih = (fDim > 1) ? fDim-2 : 1; - size_t iw = fDim-1; - - size_t wstrideDil = fAttrDilations[iw]; - size_t hstride = kWidth; - size_t hstrideDil = fAttrDilations[ih] * fAttrKernelShape[iw]; // stride dilated in the height - size_t dstride = kHeight * kWidth; - size_t dstrideDil = fAttrDilations[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw]; - size_t icstride = kHeight * kWidth * kDepth; - size_t icstrideDil = fAttrKernelShape[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw]; - size_t ocstride = fShapeW[1] * icstride; - size_t ocstrideDil = fShapeW[1] * icstrideDil; - - out << SP << "for (std::size_t oc = 0; oc < " << fShapeW[0] << "; oc++) {\n"; - out << SP << SP << "for (std::size_t ic = 0; ic < " << fShapeW[1] << "; ic++) {\n"; - if (fDim > 2) - out << SP << SP << SP << "for (std::size_t kd = 0; kd < " << kDepth << "; kd++) {\n"; - if (fDim > 1) - out << SP << SP << SP << "for (std::size_t kh = 0; kh < " << kHeight << "; kh++) {\n"; - out << SP << SP << SP << SP << "for (std::size_t kw = 0; kw < " << kWidth << "; kw++) {\n"; - - out << SP << SP << SP << SP << SP << "tensor_" < 2) out << " + kd * " << dstrideDil; - if (fDim > 1) out << " + kh * " << hstrideDil; - out << " + kw * " << wstrideDil << " ] = tensor_" << fNW << "[oc * " << ocstride << " + ic * " << icstride; - if (fDim > 2) out << " + kd * " << dstride; - if (fDim > 1) out << " + kh * " << hstride; - out << " + kw ];\n"; - - out << SP << SP << SP << SP << "}\n"; - if (fDim > 1) out << SP << SP << SP << "}\n"; - if (fDim > 2) out << SP << SP << SP << "}\n"; - out << SP << SP << "}\n"; - out << SP << "}\n"; - - //out << SP << "char " << OpName << "_transA = 'T';\n"; - out << SP << "char " 
<< OpName << "_transA = 'N';\n"; - out << SP << "char " << OpName << "_transB = 'N';\n"; - out << SP << "int " << OpName << "_m = " << oHeight * oWidth * oDepth << ";\n"; // output h*w - assert(fShapeY[1] == fShapeW[0]); - assert(fShapeW[1] == fShapeX[1] / fAttrGroup); - out << SP << "int " << OpName << "_n = " << fShapeW[0] << ";\n"; // output channels - out << SP << "int " << OpName << "_k = " << fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] << ";\n"; - out << SP << "float " << OpName << "_alpha = 1.0;\n"; - out << SP << "float " << OpName << "_beta = 0.0;\n"; - - - // Loop on batch size - out << SP << "for (size_t n = 0; n < " << bsize << "; n++) {\n"; - - // IM2COL: Unroll the input tensor - // order input data as (e.g. kernel 2x2) and (xa,ya) is channel 1 and (xb,yb) is channel 2 - // (xa1,..,xak,ya1,..yak)(xb1,...,xbk,yb1,..,ybk) - // (xa2,...xak+1,ya1,...yak)(......) - // trick for speed is using caffe im2col and output a matrix which contains filtered values as rows. - // By doing this one has consecutive memory reads and writes - // Resulting matrix op_xcol is (input channels * filter_h * filter_w , output_h * output_w) - if (fDim ==1) { - if (fAttrPads[0] != fAttrPads[1] ) { - std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " - << std::endl; - fAttrPads[0] = (fAttrPads[0] + fAttrPads[1]) / 2; - } - fAttrPads[1] = 0; - fAttrStrides[1] = 1; - } - if (fDim == 2) { - if (fAttrPads[0] != fAttrPads[2] || fAttrPads[1] != fAttrPads[3]) { - std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " << std::endl; - fAttrPads[0] = (fAttrPads[0] + fAttrPads[2]) / 2; - fAttrPads[1] = (fAttrPads[1] + fAttrPads[3]) / 2; - } - } - if (fDim == 3) { - if (fAttrPads[0] != fAttrPads[3] || fAttrPads[1] != fAttrPads[4] || fAttrPads[2] != fAttrPads[5]) { - std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. 
Assume an average padding " << std::endl; - fAttrPads[0] = (fAttrPads[0] + fAttrPads[3]) / 2; - fAttrPads[1] = (fAttrPads[1] + fAttrPads[4]) / 2; - fAttrPads[2] = (fAttrPads[2] + fAttrPads[5]) / 2; - } - } - out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n"; - - if (fAttrGroup == 1) { - out << SP << SP << "size_t x_offset = n * " << fShapeX[1] * iHeight * iWidth << ";\n"; - // when using im2col - resulting matrix is transposed, the dimension is (input_c * filter_h * filter_y, output_h * - // output_w) - if (fDim < 3) { - out << SP << SP << "SOFIE::UTILITY::Im2col(tensor_" << fNX - << " + x_offset," - // channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, - // dilation_w, - // - << fShapeW[1] << "," << iHeight << "," << iWidth << ","; - if (fDim == 1) - out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1," - << fAttrDilations[0]; - else // dim ==2 - out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1] - << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << "," - << fAttrDilations[1]; - out << "," << "tensor_" <(tensor_" << fNX - << " + x_offset," - // channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, - // dilation_d, dilation_h, dilation_w, - // - << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << "," - << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," - << fAttrPads[0] << "," << fAttrPads[1] << "," << fAttrPads[2] << "," - << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] << "," - << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << "," - << "tensor_" << fNX << "_xcol);\n\n "; - } - // BLAS - out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &" - << OpName << "_n, &" 
<< OpName << "_k, &" << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, &" << OpName - << "_m,\n"; // use m if op_xcol is not transpose , otherwise k - out << SP << SP << SP << "tensor_" << fNX << "_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY - << " + out_offset, &" << OpName << "_m);\n"; - } else { - // case of group convolution - // Unroll (IM2COL) the input tensor- make loop on groups and repeat operations (IM2COL + GEMM for each - // group) - // out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n"; - out << SP << SP << "for (size_t g = 0; g < " << fAttrGroup << "; g++) {\n"; - out << SP << SP << "size_t x_offset = n * " << fShapeX[1] * iDepth * iHeight * iWidth << " + g * " - << fShapeW[1] * iDepth * iHeight * iWidth << ";\n "; - out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << " + g * " - << fShapeW[0] * oDepth * oHeight * oWidth / fAttrGroup << ";\n "; - - if (fDim < 3) { - out << SP << SP << "SOFIE::UTILITY::Im2col(tensor_" << fNX - << " + x_offset," - // channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, - // dilation_w, - // - << fShapeW[1] << "," << iHeight << "," << iWidth << ","; - if (fDim == 1) - out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1," - << fAttrDilations[0]; - else // dim ==2 - out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1] - << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << "," - << fAttrDilations[1]; - out << ", tensor_" << fNX << "_xcol);\n\n "; - } else { - // 3d im2col - out << SP << SP << "SOFIE::UTILITY::Im2col_3d(tensor_" << fNX - << " + x_offset," - // channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, - // dilation_d, dilation_h, dilation_w, - // - << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << 
"," << fAttrKernelShape[0] << "," - << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," << fAttrPads[0] << "," << fAttrPads[1] - << "," << fAttrPads[2] << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] - << "," << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << ",tensor_" << fNX - << "_xcol);\n\n "; - } - - // BLAS - // n must be divided by the number of groups - out << SP << SP << SP << OpName << "_n = " << fShapeW[0] / fAttrGroup << ";\n"; - // offset g must be g * k * n - out << SP << SP << SP << "size_t offset_f = g * " - << fShapeW[0] * fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] / fAttrGroup - << ";\n"; - out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &" - << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, tensor_" << fNX << "_xcol, &" << OpName - << "_m,\n"; // use m if op_xcol is not transpose , otherwise k - out << SP << SP << SP << "tensor_" << fNX << "_f + offset_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY - << " + out_offset" - << ", &" << OpName << "_m);\n"; - - out << SP << SP << "}\n"; // end of group loop - } - - if (fNB2 != "") { - out << SP << "int " << OpName << "_size = " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n"; - out << SP << "float " << OpName << "_gamma = 1.0;\n"; - out << SP << "int " << OpName << "_incx = 1;\n"; - out << SP << "int " << OpName << "_incy = 1;\n"; - - out << SP << "BLAS::saxpy_(&" << OpName << "_size, &" << OpName << "_gamma, tensor_" << fNB2 << ", &" - << OpName << "_incx, tensor_" << fNY << " + out_offset, &" << OpName << "_incy);\n"; - - } - out << SP << "}\n"; // end of batch size loop - - return out.str(); - } - - /*! 
\brief Returns the blas routines needed to compile the generated code - */ - std::vector GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; } -}; - -} // namespace SOFIE - -#endif diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx deleted file mode 100644 index c834a06..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx +++ /dev/null @@ -1,129 +0,0 @@ -#ifndef SOFIE_ROperator_Expand -#define SOFIE_ROperator_Expand - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - -template -class ROperator_Expand final : public ROperator{ -private: - - std::vector fShapeX; - std::vector fShape; - std::vector fShapeY; - - std::string fNX; - std::string fNShape; - std::string fNY; - std::string fType; - - bool fInitialized = false; - -public: - ROperator_Expand(){} - ROperator_Expand(std::string nameX, std::string nameShape, std::string nameY): - fNX(UTILITY::Clean_name(nameX)), fNShape(UTILITY::Clean_name(nameShape)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - // type of output given input - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - return input; - } - - void Initialize(RModel& model) override { - // input must be a graph input, or already initialized intermediate tensor - if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA SOFIE Expand Op Input Tensor " + fNX + " is not found in model"); - } - fShapeX = model.GetTensorShape(fNX); - if (!model.IsInitializedTensor(fNShape)) { - throw std::runtime_error("TMVA::SOFIE - Tensor " + fNShape + " is not initialized."); - } - int64_t *shapeData = - static_cast(model.GetInitializedTensorData(fNShape).get()); - fShape = model.GetTensorShape(fNShape); - if (fShape.size() != 1) { - 
throw std::runtime_error("TMVA::SOFIE - Expand operator shape must be a 1d tensor."); - } - size_t N = fShape[0]; - std::vector shape(shapeData, shapeData + N); - // Y is the common shape of fShapeX and shape - fShapeY = SOFIE::UTILITY::UnidirectionalBroadcastShape( - fShapeX, shape); - fInitialized = model.IsInitializedTensor(fNX); - // Broadcast X to the common shape fShapeY - bool broadcast = !UTILITY::AreSameShape(fShapeX, fShapeY); - if (model.IsInitializedTensor(fNX)) { - // If X is an initialized tensor (constant) - auto data = model.GetInitializedTensorData(fNX); - if (broadcast) { - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX, fShapeY), - std::default_delete()); - // Update the data and the shape of X - model.UpdateInitializedTensor(fNX, model.GetTensorType(fNX), fShapeY, broadcastedData); - fShapeX = fShapeY; - // need to set as a not writable tensor - model.SetNotWritableInitializedTensor(fNX); - data = broadcastedData; - } - if (broadcast || model.IsConstantTensor(fNX)) { - fIsOutputConstant = true; // constant output in this case - model.AddConstantTensor(fNY, model.GetTensorType(fNX), fShapeY, data); - fOutputTensorNames.pop_back(); - } else { - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - } - } else { - // case input is not initialized - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - } - fType = ConvertTypeToString(model.GetTensorType(fNX)); - if (model.Verbose()) - std::cout << "Expand - output is with shape " << ConvertShapeToString(fShapeY) << std::endl; - } - - std::string GenerateInitCode() override { - std::stringstream out; - if (!fIsOutputConstant && (fInitialized || fShapeX == fShapeY ) ) { - size_t length = ConvertShapeToLength(fShapeY); - out << "// Copying initialized tensor " << fNX << " to " << fNY << "\n"; - out << SP << "std::copy(tensor_" << fNX << ", " << "tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n"; - } 
- return out.str(); - } - - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; - OpName = "op_" + OpName; - if (fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Expand Op called to Generate without being initialized first"); - } - std::stringstream out; - out << SP << "\n//------ Expand Op" << "\n"; - // No need to broadcast A if it's an initialized tensor or shapes are the same - if (!fInitialized && fShapeX != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNX << "\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << fType << ">(tensor_" << fNX << ", " << ConvertShapeToString(fShapeX) << ", " << ConvertShapeToString(fShapeY) - << ", std::span<"<(tensor_"< -#include -#include - - -namespace SOFIE{ - -class ROperator_Gather final : public ROperator -{ -private: - - int64_t fAttrAxis = 0; - - std::string fNX; - std::string fNIndices; - std::string fNY; - - std::vector fShapeX; - std::vector fShapeIndices; - std::vector fShapeY; - - std::vector fIndices; // indices vector in case they are known at initialization - - std::string fType; - -public: - ROperator_Gather(){} - ROperator_Gather(int64_t attrAxis, std::string nameX, std::string nameIndices, std::string nameY): - fAttrAxis(attrAxis), fNX(UTILITY::Clean_name(nameX)), fNIndices(UTILITY::Clean_name(nameIndices)), fNY(UTILITY::Clean_name(nameY)) { - fInputTensorNames = { fNX, fNIndices }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; - return ret; - } - - void Initialize(RModel& model) override { - if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA SOFIE Gather Op Input Tensor " + fNX + " is not found in model"); - } - fShapeX = model.GetTensorShape(fNX); - fShapeIndices = model.GetTensorShape(fNIndices); - size_t q = fShapeIndices.size(); - // Axis in range 
[0, r) where r=rank(X) - size_t r = fShapeX.size(); - // Set the axis - if (fAttrAxis < 0) { - fAttrAxis = fAttrAxis + int64_t(r); - } - // empty fShapeIndices is a scalar value for the indices - size_t indicesLength = ConvertShapeToLength(fShapeIndices); - - // case indices tensor is initialized - if (model.IsInitializedTensor(fNIndices)) { - int64_t* indicesData = static_cast(model.GetInitializedTensorData(fNIndices).get()); - //flag index tensor as not writable (not sure this is needed since index tensor might be used in generated code) - model.SetNotWritableInitializedTensor(fNIndices); - // update indices data in case of negative dim values - for (size_t i = 0; i < indicesLength; i++) { - if (indicesData[i] < 0) { - indicesData[i] += fShapeX[fAttrAxis]; - } - } - // Save in a vector gather Indices of size q - fIndices = std::vector(indicesData, indicesData + indicesLength); - } - // Output shape - if (model.Verbose()) - std::cout << "Gather: q and r " << q << " " << r << " shape indices " << ConvertShapeToString(fShapeIndices) << std::endl; - - if (fShapeY.empty()) { - fShapeY.resize(q + r - 1); - if (fAttrAxis > 0) { - // Copy shape of X[0, ..., axis) to Shape of Y[0, ..., axis) - std::copy(fShapeX.begin(), fShapeX.begin() + fAttrAxis, fShapeY.begin()); - } - // Set shape of Y[axis, ..., axis + q) - for (size_t i = 0; i < q; i++) { - fShapeY[fAttrAxis + i] = fShapeIndices[i]; - } - // Copy shape of X[axis + 1, ..., axis + r) to shape of Y[axis + q, ... 
q + r - 1) - std::copy(fShapeX.begin() + fAttrAxis + 1, fShapeX.end(), fShapeY.begin() + fAttrAxis + q); - } - // case input is known (type is an integer) and input indices is a scalar (or vector of size 1) - if (model.IsInitializedTensor(fNX) && q <= 1 && r == 1 && fIndices.size() > 0) { - if (model.GetTensorType(fNX) == ETensorType::INT64) { - auto inputData = static_cast(model.GetInitializedTensorData(fNX).get()); - // if q <=1 and r = 1 output length = 1 (it is a scalar) - std::vector outputData(ConvertShapeToLength(fShapeY)); - outputData[0] = inputData[fIndices[0]]; - model.AddConstantTensor(fNY, fShapeY, outputData.data()); - if (model.Verbose()) - std::cout << "Gather: " << fNX << " " << ConvertShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) - << " and values " << ConvertValuesToString(outputData) << " (constant) " << std::endl; - fIsOutputConstant = true; - } - } - if (!fIsOutputConstant) { - // Add output tensor - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - fType = ConvertTypeToString(model.GetTensorType(fNX)); - if (model.Verbose()) - std::cout << "Gather: " << fNX << " " << ConvertShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) - << std::endl; - } - } - - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) { - // no code to generate here for constant output. 
Tensor output is defined in Session constructor - return "//---------------------------------------\n"; - } - OpName = "op_" + OpName; - std::stringstream out; - out << "//--------- Gather operator \n"; - // The shape of the output is q + r - 1 - size_t r = fShapeX.size(); - // Indices of shape q - size_t q = fShapeIndices.size(); - // Strides - std::vector stridesX = UTILITY::ComputeStrideFromShape(fShapeX); - std::vector stridesY = UTILITY::ComputeStrideFromShape(fShapeY); - std::vector stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); - - // case fIndices is not known we need to correct for negative axis indices at run-time - if (fIndices.empty()) { - size_t indicesLength = ConvertShapeToLength(fShapeIndices); - out << SP << "// correct in case of negative gather indices\n"; - out << SP << "for (size_t i = 0; i < " << indicesLength << "; i++){\n"; - out << SP << SP << "if (tensor_" << fNIndices << "[i] < 0)\n"; - out << SP << SP << SP << "tensor_" << fNIndices << "[i] += " << fShapeX[fAttrAxis] << ";\n"; - out << SP << "}\n"; - } - - - // Fill the output Y[j_0, j_1, ..., j_{axis - 1}, i_0, i_1, ..., i_{q - 1}, j_{axis + 1}, ..., j_{r - 1}] - // [0 ... axis) [axis ... axis + q) [axis + q ... q + r - 1) - // iterate in [0 ... axis) [0 ... q) [axis ... 
r - 1) - // for j_0, j_1, ..., j_{axis-1} - for (size_t j = 0; j < size_t(fAttrAxis); j++) { - std::string index = "j_" + std::to_string(j); - out << SP << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[j] << "; " << index << "++) {\n"; - } - // for i_0, i_1, ..., i_{q - 1} - if (q == 0) - out << SP << SP << "{\n"; // add a scope for local variables - for (size_t i = 0; i < q; i++) { - std::string index = "i_" + std::to_string(i); - out << SP << SP << "for (size_t " << index << " = " << 0 << "; " << index << " < " << fShapeIndices[i] << "; " << index << "++) {\n"; - } - // for j_axis, j_{axis + 1}, ..., j_{r - 1} - for (size_t j = fAttrAxis; j + 1 < r; j++) { - std::string index = "j_" + std::to_string(j); - out << SP << SP << SP << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[q + j] << "; " << index << "++) {\n"; - } - - out << SP << SP << SP << "size_t y_index = 0;\n"; - for (size_t j = 0; j < size_t(fAttrAxis); j++) { - out << SP << SP << SP << "y_index += j_" + std::to_string(j) + " * " << stridesY[j] << ";\n"; - } - for (size_t i = 0; i < q; i++) { - out << SP << SP << SP << "y_index += i_" + std::to_string(i) + " * " << stridesY[fAttrAxis + i] << ";\n"; - } - for (size_t j = fAttrAxis; j + 1 < r; j++) { - out << SP << SP << SP << "y_index += j_" + std::to_string(j) + " * " << stridesY[q + j] << ";\n"; - } - // Indices - out << SP << SP << SP << "size_t i_index = 0;\n"; - for (size_t i = 0; i < q; i++) { - out << SP << SP << SP << "i_index += i_" + std::to_string(i) + " * " << stridesIndices[i] << ";\n"; - } - // K - out << SP << SP << SP << "size_t k = static_cast(" << "tensor_" << fNIndices << "[i_index]" << ");\n"; - // Input - out << SP << SP << SP << "size_t x_index = k * " << stridesX[fAttrAxis] << ";\n"; - for (size_t j = 0; j < size_t(fAttrAxis); j++) { - out << SP << SP << SP << "x_index += j_" + std::to_string(j) + " * " << stridesX[j] << ";\n"; - } - for (size_t j = fAttrAxis + 1; j < r; j++) { - out << SP << SP 
<< SP << "x_index += j_" + std::to_string(j - 1) + " * " << stridesX[j] << ";\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[y_index] = tensor_" << fNX << "[x_index];\n"; - - // end loops j_k, j_{k + 1}, ..., j_{r - 2} - for (size_t j = fAttrAxis; j + 1 < r; j++) { - out << SP << SP << SP << "}\n"; - } - // end loops i_0, i_1, ..., i_{q - 1} - if (q == 0) - out << SP << SP << "}\n"; // end of scope for q = 0 - for (size_t i = 0; i < q; i++) { - out << SP << SP << "}\n"; - } - // end loops j_0, j_1, ..., j_{axis - 1} - for (size_t j = 0; j < size_t(fAttrAxis); j++) { - out << SP << "}\n"; - } - - return out.str(); - } - -}; - -}//SOFIE - -#endif //SOFIE_ROPERATOR_RELU diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx deleted file mode 100644 index 046bf56..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx +++ /dev/null @@ -1,399 +0,0 @@ -#ifndef SOFIE_ROPERATOR_GEMM -#define SOFIE_ROPERATOR_GEMM - - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include -#include -#include -#include -#include -#include - - -namespace SOFIE{ - - - template - class ROperator_Gemm final : public ROperator - { - - private: - bool fIsDynamic = false; - - float fAttrAlpha = 1.0; - float fAttrBeta = 1.0; - int_t fAttrTransA = 0; - int_t fAttrTransB = 0; - - std::string fNA; - std::string fNB; - std::string fNC = ""; - std::string fNC2; // bias tensor name after broadcasting - std::string fNY; - std::string fType; - EActivationType fActivation; - std::vector fShapeA; - std::vector fShapeB; - std::vector fShapeC; - std::vector fShapeY; - - public: - - ROperator_Gemm(){} - ROperator_Gemm(float alpha, float beta, int_t transA, int_t transB, std::string nameA, std::string nameB, std::string nameY, EActivationType activation=EActivationType::UNDEFINED): - fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), - 
fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) - { - fActivation = activation; - fType = "float"; - static_assert(std::is_same_v, - "TMVA::SOFIE - Unsupported type parsing a Gemm operator"); - fInputTensorNames = { fNA, fNB }; - fOutputTensorNames = { fNY }; - } - - ROperator_Gemm(float alpha, float beta, int_t transA, int_t transB, std::string nameA, std::string nameB, std::string nameC, std::string nameY, EActivationType activation=EActivationType::UNDEFINED): - fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), - fNB(UTILITY::Clean_name(nameB)), fNC(UTILITY::Clean_name(nameC)), fNY(UTILITY::Clean_name(nameY)), fActivation(activation) - { - fActivation = activation; - fType = "float"; - - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - ETensorType out = input[0]; - return {out}; - } - - template - std::vector> DoShapeInference(const std::vector> & input){ - if (input.size() > 3) throw std::runtime_error("TMVA SOFIE Gemm Op Shape Inference only need 2 or 3 input tensor"); - // accept tensor with input dimensions > 2 - // example: A = (d1,d2,...,N1,N2) B = (d1,d2,...,N2,N3) --> Y = (d1,d2,..,N1,N3) - for (auto& i: input){ - if (i.size() < 2){ - throw std::runtime_error("TMVA SOFIE Gemm Op Shape Inference only accept input tensor with >=2 dimensions"); - } - } - - std::vector> ret; - // when there are 3 inputs shape of Y is the one of C - if (input.size() == 3){ - ret.push_back(input[2]); //shape of C is shape of Y - return ret; - } - // ioffset cannot be less than 2 - int ioffset = input[0].size()-2; // in case of tensors with dim > 2 - - std::vector s_a(input[0].begin() + ioffset, input[0].begin() + ioffset + 2); - std::vector s_b(input[1].begin() + ioffset, input[1].begin() + ioffset + 2); - // reverse in case of transpose - if (fAttrTransA){ - std::reverse(s_a.begin(), s_a.end()); - } - if (fAttrTransB){ - std::reverse(s_b.begin(), 
s_b.end()); - } - std::vector s_y; - s_y.reserve(input[0].size()); - if (input[0].size() > 2 && input[1].size() == input[0].size()) { - // in case of dim > 2 first dimensions are equal to the input ones not - // equal to 1 (e.g. (1,2,3) * (2,3,4) -> (2,2,4)) - for (size_t i = 0; i < input[0].size()-2; i++) { - Dim valueA = input[0][i]; - Dim valueB = input[1][i]; - if (valueA.GetVal() != valueB.GetVal()) { - if (valueB.GetVal() == "1") - s_y.push_back(input[0][i]); - else if (valueA.GetVal() == "1") - s_y.push_back(input[1][i]); - else - throw std::runtime_error("TMVA SOFIE Gemm Op - invalid input shapes " + valueA.GetVal() + " and " - + valueB.GetVal()); - } - s_y.push_back(input[0][i]); - } - } - - s_y.push_back(s_a[0]); - s_y.push_back(s_b[1]); - ret.push_back(s_y); - return ret; - } - - std::vector> ShapeInference(std::vector> input) override { - return DoShapeInference(input); - } - std::vector> DynamicShapeInference(const std::vector> & input){ - return DoShapeInference(input); - } - - - - void Initialize(RModel& model) override { - //TODO: propagate A or B as specified by ONNX standard - - if ((model.CheckIfTensorAlreadyExist(fNA) == false) || (model.CheckIfTensorAlreadyExist(fNB) == false) ){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor " + fNA + " or " + fNB + " is not found in model"); - } - if (fNC != ""){ - if (model.CheckIfTensorAlreadyExist(fNC) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor" + fNC + " is not found in model"); - } - } - if (model.IsDynamicTensor(fNA) || model.IsDimInputTensor(fNA) ) { - fShapeA = model.GetDynamicTensorShape(fNA); - fIsDynamic = true; - } else { - auto shapeA_int = model.GetTensorShape(fNA); - fShapeA = ConvertShapeToDim(shapeA_int); - } - // case A is of dim1 we prepend a 1 but we need to remove later - bool prependOne = false; - if 
(fShapeA.size() == 1) { - fShapeA.insert(fShapeA.begin(), Dim(1)); - prependOne = true; - } - - if (model.IsDynamicTensor(fNB) || model.IsDimInputTensor(fNB)) { - fShapeB = model.GetDynamicTensorShape(fNB); - fIsDynamic = true; - } - else { - auto shapeB_int = model.GetTensorShape(fNB); - fShapeB = ConvertShapeToDim(shapeB_int); - } - // case B is dim1 we append a 1 but we need to remove later - bool appendOne = false; - if (fShapeB.size() == 1) { - fShapeB.insert(fShapeB.end(), Dim(1)); - appendOne = true; - } - // assume if not shape is 2 that extra values are 1. - // implement also MatMul case where we stack matrices (see numpy.matmul) - if (fShapeA.size() != fShapeB.size()) { - // if different dimensions we prepend 1 values - if (fShapeA.size() < fShapeB.size()) { - fShapeA.insert(fShapeA.begin(), fShapeB.size()-fShapeA.size(), Dim(1)); - } else if (fShapeB.size() < fShapeA.size()) { - fShapeB.insert(fShapeB.begin(), fShapeA.size()-fShapeB.size(), Dim(1)); - } - } - - fShapeY = DynamicShapeInference({fShapeA, fShapeB})[0]; - std::vector shapeY; - if (!fIsDynamic) { - shapeY = ConvertShapeToInt(fShapeY); - if (shapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Gemm Op " + fNY + " has invalid shape" + ConvertDynamicShapeToString(fShapeY)); - } - } - - // bias is normally not dynamic (not support it for time being) - if (fNC != ""){ - // normally bias is fixed and not dynamic - if (model.IsDynamicTensor(fNC)) { - throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor" + fNC + " is dynamic and is not supported"); - } - fShapeC = model.GetTensorShape(fNC); - fNC2 = fNC; - size_t lengthC = ConvertShapeToLength(fShapeC); - size_t lengthY = ConvertShapeToLength(shapeY); - // for dynamic outputs broadcasting is always done - bool broadcast_needed = lengthC != lengthY; - - - if (broadcast_needed) { - if (!model.UseSession()) { - // without session dynamic tensors not supported in Gemm - if (fIsDynamic) { - throw std::runtime_error("TMVA SOFIE Gemm Op: dynamic 
tensors not supported without a session"); - } - auto original_data = model.GetInitializedTensorData(fNC); - auto targetShape = UTILITY::UnidirectionalBroadcastShape(fShapeC, shapeY); - if (fType == "float") { - std::shared_ptr new_data_ptr(UTILITY::UnidirectionalBroadcast( - static_cast(original_data.get()), fShapeC, targetShape), - std::default_delete()); - - model.UpdateInitializedTensor(fNC, model.GetTensorType(fNC), shapeY, new_data_ptr); - fShapeC = shapeY; - } - } else { - // In case of session add broadcasting code in Session constructor and in GenerateInitCode - // we need to add a new intermediate tensor for broadcasted bias tensor - fNC2 = fNC + "bcast"; - if (!fIsDynamic) { - model.AddIntermediateTensor(fNC2, model.GetTensorType(fNC), shapeY); - } - else - model.AddDynamicTensor(fNC2,model.GetTensorType(fNC), fShapeY); - } - } - } - - // remove appended or prepended value of 1 - if (prependOne) { - if (fIsDynamic) - fShapeY.erase(fShapeY.begin()); - else - shapeY.erase(shapeY.begin()); - } - if (appendOne) { - if (fIsDynamic) - fShapeY.erase(fShapeY.end()-1); - else - shapeY.erase(shapeY.end()-1); - } - - if (!fIsDynamic) - model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), shapeY); - else - model.AddDynamicTensor(fNY, model.GetTensorType(fNA), fShapeY); - - if (model.Verbose()){ - std::cout << "Gemm (or MatMul) " << " ---> " << fNY << " shape "; - if (fIsDynamic) - std::cout << ConvertDynamicShapeToString(fShapeY) << std::endl; - else - std::cout << ConvertShapeToString(shapeY) << std::endl; - } - - model.AddNeededStdLib("algorithm"); - } - - std::string GenerateInitCode() override { - std::stringstream out; - // generate initialization code for broadcasting of bias tensor - if (fShapeC.size() != fShapeY.size() && fNC != fNC2) { - // we broadcast here always C in Y output, so target shape is the one of Y - // no need to call UTILITY::UnidirectionalBroadcastShape. 
- // here in case of parametric shape we need to assume that the parameters will be defined in the initialization code. - auto targetShape = fShapeY; - // include a separate scope to avoid defining unique operator temp variables - out << "//--- broadcast bias tensor " << fNC << "for Gemm op\n"; - out << SP << "{\n"; - out << " float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" - << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertDynamicShapeToString(fShapeY) << ");\n"; - auto length = SOFIE::ConvertDynamicShapeToLength(fShapeY); // output size - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNC2 << ");\n"; - out << SP << SP << "delete [] data;\n"; - out << SP << "}\n"; - } - return out.str(); - } - - std::string Generate(std::string opName) override { - opName = "op_" + opName; - - if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fShapeC.empty())) { - throw std::runtime_error("TMVA SOFIE Gemm Op called to Generate without being initialized first"); - } - std::stringstream out; - out << "\n//--------- Gemm\n"; - out << SP << "char " << opName << "_transA = " << (fAttrTransA ? "\'t\'" : "\'n\'") << ";\n"; - out << SP << "char " << opName << "_transB = " << (fAttrTransB ? "\'t\'" : "\'n\'") << ";\n"; - // need to consider case A and B have dim > 2 (for MatMul) - int64_t dimA = fShapeA.size(); - int64_t dimB = fShapeB.size(); - int64_t dimY = fShapeY.size(); - if (dimA != dimB || dimA != dimY) { - throw std::runtime_error("TMVA SOFIE Gemm(MatMul) has invalid shape for inputs or output"); - } - auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); - auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); - auto k = (fAttrTransA ? 
fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); - std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; - // extra dimensions in case of stacked MatMul - std::vector sA; - for (int64_t i = 0; i < dimY-2; i++) { - sA.push_back(fShapeY[i]); - } - auto lengthGemm = ConvertDynamicShapeToLength(sY); // size of the Gemm operation - auto lengthExtra = ConvertDynamicShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul) - - out << SP << "int " << opName << "_m = " << m << ";\n"; - out << SP << "int " << opName << "_n = " << n << ";\n"; - out << SP << "int " << opName << "_k = " << k << ";\n"; - out << SP << "float " << opName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ";\n"; - out << SP << "float " << opName << "_beta = " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ";\n"; - out << SP << "int " << opName << "_lda = " << (fAttrTransA ? m : k) << ";\n"; - out << SP << "int " << opName << "_ldb = " << (fAttrTransB ? 
k : n) << ";\n"; - - // case bias is present - if (!fNC.empty()){ - if (fNC2 == fNC) { - // add a check in case broadcasting was not needed or done outside of session - // C should have smaller dimension of Y - if (!fIsDynamic) { - if (std::stoi(lengthGemm) != static_cast(ConvertShapeToLength(fShapeC))) - throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor has not correct size " - + ConvertShapeToString(fShapeC) + " output length " + lengthGemm); - } else { - // add a dynamic check (C should not be a dynamic tensor) - out << SP << "assert(" << lengthGemm << " != " << ConvertShapeToLength(fShapeC) << ");\n"; - } - } - } else { - //in this case fAttrBeta needs to be equal to zero otherwise second time we run we will use - // the previous result - if (fAttrBeta != 0) { - throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor is not present but beta value in Gemm is not zero"); - } - } - - // include MatMul case where we stack the Gemm operations - // exclude case where we have only 1's in the additional dims - bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra) > 1); - if (doStackMul) { - out << SP << "size_t " << opName << "_yoffset = 0;\n"; // needed if we stack the gemm operations - out << SP << "for (int i = 0; i < " << lengthExtra << "; i++){\n"; - out << SP; - } - // in the case of bias - if (!fNC.empty()){ - out << SP << "std::copy(" << "tensor_" << fNC2 << ", " << "tensor_" << fNC2 << " + " << lengthGemm << ", " - << "tensor_" << fNY; - if (doStackMul) out << " + " << opName << "_yoffset"; - out << ");\n"; - } - - - if (fType == "float"){ - - out << SP << "BLAS::sgemm_(&" << opName << "_transB, &" << opName << "_transA, &" << opName - << "_n, &" << opName << "_m, &" << opName << "_k, &" << opName << "_alpha, " << "tensor_" << fNB - << ", &" << opName << "_ldb, " << "tensor_" << fNA << ", &" << opName << "_lda, &" << opName << "_beta, " - << "tensor_" << fNY; - if (doStackMul) out << " + " << opName << 
"_yoffset"; - out << ", &" << opName << "_n);\n"; - - if(fActivation == EActivationType::RELU){ - out << SP << "for (int id = 0; id < " << SOFIE::ConvertDynamicShapeToLength(fShapeY) << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNY << "[id] > 0 )? tensor_" << fNY << "[id] : 0);\n"; - out << SP << "}\n"; - } - } - - if (doStackMul) { - out << SP << SP << opName << "_yoffset += " << lengthGemm << ";\n"; - out << "}\n"; // end of loop on the stacked multiplications - } - - return out.str(); - } - - std::vector GetBlasRoutines() override { return { std::string("Gemm"), std::string("Gemv") }; } - - }; - - -}//SOFIE - -#endif //SOFIE_ROPERATOR_GEMM diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx deleted file mode 100644 index 17b77b3..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx +++ /dev/null @@ -1,343 +0,0 @@ -#ifndef SOFIE_ROPERATOR_LAYERNORMALIZATION -#define SOFIE_ROPERATOR_LAYERNORMALIZATION - -#include "SOFIE/RModel.hxx" -#include "SOFIE/SOFIE_common.hxx" - -#include -#include - - -namespace SOFIE { - -template -class ROperator_LayerNormalization : public ROperator { -private: - int fAttrAxis; - float fAttrEpsilon; - size_t fAttrStashType; - - std::string fNX; - std::string fNScale; - std::string fNB; - std::string fNY; - std::string fNMean; - std::string fNInvStdDev; - - std::string fNCastedX; - std::string fNNormalizedX; - std::string fNBroadcastedB; - - std::vector fShapeX; - std::vector fShapeScale; - std::vector fShapeB; // shape of input Bias (B) is assumed to be fully defined - std::vector fShapeY; - std::vector fShapeMean; - std::vector fShapeInvStdDev; - - size_t fAxis; // axis in [0, size) - size_t fSize; // Size of the input - // size_t fAxisDim; - - std::vector fNormalizedShape; - std::vector fAxesShape; - // lengths in string format - std::string fLength; // Length of the input - std::string 
fNormalizedLength; - std::string fAxesLength; - - std::string fType; - -public: - ROperator_LayerNormalization() {} - - ROperator_LayerNormalization(int axis, float epsilon, size_t stashType, const std::string &nameX, - const std::string &nameScale, const std::string &nameB, const std::string &nameY, - const std::string &nameMean, const std::string &nameInvStdDev) - : fAttrAxis(axis), fAttrEpsilon(epsilon), fAttrStashType(stashType), fNX(UTILITY::Clean_name(nameX)), - fNScale(UTILITY::Clean_name(nameScale)), fNB(UTILITY::Clean_name(nameB)), - fNY(UTILITY::Clean_name(nameY)), fNMean(UTILITY::Clean_name(nameMean)), fNInvStdDev(UTILITY::Clean_name(nameInvStdDev)) - { - fInputTensorNames = { fNX, fNScale }; - if (!fNB.empty()){ - fInputTensorNames.emplace_back(fNB); - } - - fOutputTensorNames = { fNY }; - if (!fNMean.empty()){ - fOutputTensorNames.emplace_back(fNMean); - } - if (!fNInvStdDev.empty()){ - fOutputTensorNames.emplace_back(fNInvStdDev); - } - } - - std::vector> ShapeInference(std::vector> input) override { return input; } - - std::vector TypeInference(std::vector input) override { return input; } - - void Initialize(RModel& model) override { - if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); - } - bool isDynamic = model.IsDynamicTensor(fNX); - fShapeX = model.GetDynamicTensorShape(fNX); - fShapeY = fShapeX; - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - // Type of the output - fType = ConvertTypeToString(model.GetTensorType(fNX)); - // Size of the input - fSize = fShapeX.size(); - // Axis in [0, size) - fAxis = (fAttrAxis < 0) ? 
fSize + fAttrAxis : fAttrAxis; - // Shape of fShapeX[0, ..., fAxis) - fAxesShape = std::vector(fShapeX.begin(), fShapeX.begin() + fAxis); - // Length of the axes - fAxesLength = ConvertDynamicShapeToLength(fAxesShape); - // Shape of fShapeX[fAxis, ..., fSize) - fNormalizedShape = std::vector(fShapeX.begin() + fAxis, fShapeX.end()); - // Length of the normalized axis - fNormalizedLength = ConvertDynamicShapeToLength(fNormalizedShape); - // length of the input - fLength = ConvertDynamicShapeToLength(fShapeX); - // Type of mean and std - ETensorType type = (fAttrStashType == 1) ? ETensorType::FLOAT : model.GetTensorType(fNX); - // Mean - if (fNMean.empty()) { - fNMean = "Mean" + fNX; - // cannot use initializer list with one element since it is ambiguous - if (isDynamic) - // add size_t(-1) to indicate that shape is an expression - model.AddIntermediateTensor(fNMean, type, std::vector(1,Dim{fAxesLength,std::size_t(-1)})); - else - model.AddIntermediateTensor(fNMean, type, std::vector(1,std::stoi(fAxesLength))); - } - // Inverse Standard Deviation - if (fNInvStdDev.empty()) { - fNInvStdDev = "InvStdDev" + fNX; - if (isDynamic) - model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,Dim{fAxesLength,std::size_t(-1)})); - else - model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,std::stoi(fAxesLength))); - } - // Cast X to float - if (fAttrStashType == 1 && model.GetTensorType(fNX) != ETensorType::FLOAT) { - fNCastedX = "Casted" + fNX; - model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); - fNNormalizedX = "Normalized" + fNX; - model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); - } - // Broadcast the bias - if (!fNB.empty()) { - fShapeB = model.GetTensorShape(fNB); - size_t lengthB = ConvertShapeToLength(fShapeB); - if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { - fNBroadcastedB = "Broadcasted" + fNB; - model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); - } - } - 
model.AddNeededStdLib("cmath"); - } - - std::string GenerateInitCode() override - { - std::stringstream out; - if (!fNBroadcastedB.empty()) { - out << SP << "// Broadcasting the bias of LayerNormalization op\n"; - out << SP << "{\n"; - out << SP << SP << "float* data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_"; - out << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertDynamicShapeToString(fShapeX) << ");\n"; - out << SP << "std::copy(data, data + " << fLength << ", tensor_" << fNBroadcastedB << ");\n"; - out << SP << "delete[] data;\n"; - out << SP << "}\n"; - } - return out.str(); - } - - std::string Generate(std::string opName) override - { - opName = "op_" + opName; - if (fShapeX.empty()) { - throw std::runtime_error("TMVA::SOFIE LayerNormalization operator " + opName + - " called to generate without being initialized first."); - } - if (fShapeX.size() > 5) { - throw std::runtime_error("TMVA::SOFIE LayerNormalization operator not " - "implemented for input tensor of size > 5."); - } - - std::stringstream out; - - out << "//---- Layer Normalization operator " << opName << "\n"; - - // Loop over all the normalized axes i.e. 
[axis, ..., size) - std::vector inputShape(fSize); - - for (size_t i = 0; i < fSize; i++) { - inputShape[i] = fShapeX[i].GetVal(); - } - - auto strides = UTILITY::ComputeStrideFromShape(fShapeX); - std::string InputIndex = "axis_0 * " + strides[0].GetVal(); - for (size_t i = 1; i < fSize; i++) { - InputIndex += " + axis_" + std::to_string(i) + " * " + strides[i].GetVal(); - } - - auto axesStrides = UTILITY::ComputeStrideFromShape(fAxesShape); - std::string axesIndex = "axis_" + std::to_string(0) + " * " + axesStrides[0].GetVal(); - for (size_t i = 1; i < fAxis; i++) { - axesIndex += " + axis_" + std::to_string(i) + " * " + axesStrides[i].GetVal(); - } - - auto normalizedStrides = UTILITY::ComputeStrideFromShape(fNormalizedShape); - std::string normalizedIndex = "axis_" + std::to_string(fAxis) + " * " + normalizedStrides[0].GetVal(); - for (size_t i = fAxis + 1; i < fSize; i++) { - normalizedIndex += " + axis_" + std::to_string(i) + " * " + normalizedStrides[i - fAxis].GetVal(); - } - - if (!fNCastedX.empty()) { - // Cast X to float - out << SP << "for (size_t i = 0; i < " << fLength << "; i++) {\n"; - out << SP << SP << "tensor_" << fNCastedX << "[i] = " << "static_cast(tensor_" << fNX; - out << "[i]);\n"; - out << SP << "}\n"; - } - - out << SP << "// Compute the mean\n"; - // Loop over the normalized dimensions - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++) {\n"; - } - out << SP << SP << fType << " sum = 0.;\n"; - // loop over all the dims in [0, fAxis) - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++) {\n"; - } - out << SP << SP << SP << "sum += tensor_" << fNX << "[" << InputIndex << "];\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << 
"}\n"; - } - out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = sum / " << fType << "("; - out << fNormalizedLength << ");\n"; - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - - out << SP << "// Compute the inverse Standard Deviation\n"; - // Loop over the normalized dimensions - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - // Set sum = 0 - out << SP << SP << fType << " sum = 0.;\n"; - // loop over all the dims in [0, fAxis) - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << InputIndex << "] - tensor_" - << fNMean << "[" << axesIndex << "];\n"; - out << SP << SP << SP << "sum += tmp*tmp;\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = 1 / std::sqrt("; - out << "sum / " << fType << "(" << fNormalizedLength << ") + " << fAttrEpsilon << ");\n"; - for (size_t i = 0; i < fAxis; i++) { - out << SP << "}\n"; - } - - if (!fNCastedX.empty()) { - out << "// NormalizedX = InvStdDev * (CastedX - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNNormalizedX << "[" << InputIndex << "] = tensor_"; - out << fNInvStdDev << "[" << 
axesIndex << "] * (tensor_" << fNCastedX << "[" << InputIndex; - out << "] - tensor_" << fNMean << "[" << axesIndex << "])\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - out << "// Y = Scale o NormalizedX"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << axesIndex << "] * static_cast<" << fType << ">(tensor_" << fNCastedX << "[" << InputIndex; - out << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - } else { - out << SP << "// Y = Scale o InvStdDev (X - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << normalizedIndex << "] * tensor_" << fNInvStdDev << "[" << axesIndex; - out << "] * (tensor_" << fNX << "[" << InputIndex << "] - tensor_" << fNMean << "["; - out << axesIndex << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; 
i++) { - out << SP << "}\n"; - } - } - - if (!fNB.empty()) { - std::string bias = "tensor_" + (fNBroadcastedB.empty() ? fNB : fNBroadcastedB); - out << SP << "// Add the bias to Y\n"; - out << SP << "int " << opName << "_n = " << fLength << ";\n"; - out << SP << "float " << opName << "_alpha = 1.;\n"; - out << SP << "int " << opName << "_inc = 1;\n"; - out << SP << "BLAS::saxpy_(&" << opName << "_n, &" << opName << "_alpha, " << bias << ", &"; - out << opName << "_inc, " << "tensor_" << fNY << ", &" << opName << "_inc);\n"; - } - - return out.str(); - } - - std::vector GetBlasRoutines() override { return { std::string("Axpy") }; } - - std::vector GetStdLibs() override { return { std::string("cmath") }; } -}; - -} // namespace SOFIE - - -#endif diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx deleted file mode 100644 index 8fefa6d..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef SOFIE_ROPERATOR_LeakyRelu -#define SOFIE_ROPERATOR_LeakyRelu - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - -template -class ROperator_LeakyRelu final : public ROperator -{ - -private: - - /* Attributes*/ - float falpha=0.01; //default value - std::string fNX; - std::string fNY; - std::vector fShape; - std::string fType; - -public: - ROperator_LeakyRelu(){} - ROperator_LeakyRelu(float alpha,std::string nameX, std::string nameY): - falpha(alpha),fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) - { - if(std::is_same::value){ - fType = "float"; - } - else{ - throw - std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Leaky Relu operator"); - } - - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto 
ret = input; //suggest copy to compiler - return ret; - } - - void Initialize(RModel& model) override { - if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Leaky Relu Op Input Tensor is not found in model"); - } - fShape = model.GetTensorShape(fNX); - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); - } - - - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; - if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Operator Leaky Relu called to Generate without being initialized first"); - } - std::stringstream out; - size_t length = ConvertShapeToLength(fShape); - - out << SP << "constexpr float " << OpName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << falpha << ";\n"; - - out << "\n//------ LEAKY RELU\n"; - out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] >= 0 )? 
tensor_" << fNX << "[id] : "<< OpName << "_alpha * tensor_"<< fNX<<"[id]);\n"; - out << SP << "}\n"; - return out.str(); - } - -}; - -}//SOFIE - -#endif //SOFIE_ROPERATOR_LeakyRelu diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx deleted file mode 100644 index 8062dca..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef SOFIE_ROPERATOR_RELU -#define SOFIE_ROPERATOR_RELU - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - -template -class ROperator_Relu final : public ROperator -{ - -private: - - std::string fNX; - std::string fNY; - std::vector fShape; - -public: - ROperator_Relu(){} - ROperator_Relu(std::string nameX, std::string nameY): - fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; //suggest copy to compiler - return ret; - } - - void Initialize(RModel& model) override { - if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Relu Op Input Tensor " + fNX + " is not found in model"); - } - - fShape = model.GetDynamicTensorShape(fNX); - - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); - if (model.Verbose()) { - std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDynamicShapeToString(fShape) << std::endl; - } - } - - - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; - if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Operator Relu called to Generate without being initialized first"); - } - std::stringstream out; - auto length = 
ConvertDynamicShapeToLength(fShape); - out << "\n//------ RELU\n"; - out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] > 0 )? tensor_" << fNX << "[id] : 0);\n"; - out << SP << "}\n"; - return out.str(); - } - -}; - -}//SOFIE - -#endif //SOFIE_ROPERATOR_RELU diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx deleted file mode 100644 index 66a7e09..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx +++ /dev/null @@ -1,252 +0,0 @@ -#ifndef SOFIE_ROPERATOR_RESHAPE -#define SOFIE_ROPERATOR_RESHAPE - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include -#include - -namespace SOFIE{ - -enum ReshapeOpMode { Reshape, Flatten, Squeeze, Unsqueeze }; - - -class ROperator_Reshape final : public ROperator -{ - -private: - - bool fVerbose = false; - ReshapeOpMode fOpMode = Reshape; // type of Reshape operator - - int fAllowZero = 0; // (for Reshape) zero in tensor shape makes output shape equal to input tensor shape - int fAxis = 1; // (for Flatten) - - std::string fNData; // input data tensor name - std::string fNShape; // reshape tensor name - std::string fNOutput; // output tensor name - std::vector fShapeInput; // input shape data - std::vector fShapeOutput; // output shape data - std::vector fAttrAxes; // axes attributes (provided for all version of Squeeze/Unsqueeze) - -public: - - std::string Name() const { - if (fOpMode == Reshape) return "Reshape"; - if (fOpMode == Flatten) return "Flatten"; - if (fOpMode == Squeeze) return "Squeeze"; - if (fOpMode == Unsqueeze) return "Unsqueeze"; - return ""; - } - - ROperator_Reshape(){} - ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameShape, std::string nameOutput) - : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNShape(UTILITY::Clean_name(nameShape)), - 
fNOutput(UTILITY::Clean_name(nameOutput)) - { - if (opMode == Reshape) fAllowZero = attr_value; - if (opMode == Flatten) fAxis = attr_value; - - fInputTensorNames = { fNData }; - if(!fNShape.empty()){ - fInputTensorNames.emplace_back(fNShape); - } - fOutputTensorNames = { fNOutput }; - } - - // for squeeze/unsqueezed operators following old ONNX version (< 10) - // In this cases axes are passed as attribute values - ROperator_Reshape(ReshapeOpMode opMode, std::vector attrAxes, std::string nameData, std::string nameOutput) - : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNOutput(UTILITY::Clean_name(nameOutput)), - fAttrAxes(attrAxes) - { - assert(fOpMode == Squeeze || fOpMode == Unsqueeze); - } - - // output type is same as input - std::vector TypeInference(std::vector input) override { - auto ret = std::vector(1, input[0]); - return ret; - } - - // output shape - std::vector> ShapeInference(std::vector> input) override { - std::vector> ret; - auto & input_shape = input[0]; - - if (fOpMode == Reshape) { - if (input.size() != 2) throw std::runtime_error("TMVA SOFIE Reshape Op needs 2 input tensors"); - auto output_shape = input[1]; // the provided shape - size_t input_length = ConvertShapeToLength(input_shape); - size_t output_length = ConvertShapeToLength(output_shape); - // (input_length == output_length) is the easy case : (2,3,4) -> (2,12) - if (input_length != output_length) { - if ((output_length == 0 && fAllowZero == 0) || static_cast(output_length) < 0) { - // in this case value 0 or -1 in shape are automatically corrected - bool replacementDone = false; - for (size_t i = 0; i < output_shape.size(); i++) { - if (output_shape[i] == 0 || output_shape[i] == static_cast(-1)) { - if (replacementDone) { - throw std::runtime_error("TMVA Reshape Op : output shape has multiple negative or zero values"); - } - auto tmp = output_shape; - tmp.erase(tmp.begin() + i); - auto tmp_length = ConvertShapeToLength(tmp); - output_shape[i] = input_length / tmp_length; 
- replacementDone = true; - } - } - if (fVerbose) - std::cout << "Reshape: correct output shape from " << ConvertShapeToString(input[1]) - << " to " << ConvertShapeToString(output_shape) << std::endl; - } - if (ConvertShapeToLength(output_shape) != input_length) { - throw std::runtime_error("TMVA Reshape Op : Invalid shapes : " + ConvertShapeToString(input_shape) + - ConvertShapeToString(output_shape)); - } - } - ret.push_back(output_shape); - - } else if (fOpMode == Flatten) { - // flattenig case - size_t inputSize = ConvertShapeToLength(input_shape); - size_t b = input[0][0]; - std::vector newShape = {b, inputSize / b}; - ret.push_back(newShape); - - } else if (fOpMode == Squeeze) { - // squeeze - // assume no axis is provided - remove all axes with value equal to 1 - auto output_shape = input[0]; - if (input.size() == 1) { - size_t i = 0; - while (i < output_shape.size()) { - if (output_shape[i] == 1 ) { - output_shape.erase(output_shape.begin() + i); - } - else { - i++; - } - } - } else if (input.size() == 2) { - auto & axes = input[1]; - for (size_t i = 0; i < axes.size(); i++){ - if (output_shape[axes[i]] != 1) - throw std::runtime_error("TMVA Squeeze Op : Invalid axes : " + ConvertShapeToString(axes) + - ConvertShapeToString(output_shape)); - output_shape.erase(output_shape.begin() + axes[i]); - } - } - ret.push_back(output_shape); - } - - else if (fOpMode == Unsqueeze) { - // unsqueeze - assert(input.size() == 2); - auto output_shape = input[0]; - auto &axes = input[1]; - // output rank - int64_t r = input[0].size() + axes.size(); - for (auto & a : axes) { - int64_t i = static_cast(a); - if ( i < -r || i > r - 1 ) - throw std::runtime_error("TMVA Unsqueeze Op - axes input is not in correct range"); - if (i >= 0) - output_shape.insert(output_shape.begin() + i, 1); - else - //negative axes - output_shape.insert(output_shape.end() + i + 1, 1); - } - ret.push_back(output_shape); - } - return ret; - } - - void Initialize(RModel& model) override { - - fVerbose = 
model.Verbose(); - if (model.CheckIfTensorAlreadyExist(fNData) == false) { - // input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA Reshape Op Input Tensor " + fNData + " is not found in model"); - } - fShapeInput = model.GetTensorShape(fNData); - // check if optional shape tensor exist - if (!fNShape.empty()) { - if (model.CheckIfTensorAlreadyExist(fNShape)) { - auto dptr = model.GetInitializedTensorData(fNShape); - auto input_shape = static_cast(dptr.get()); - auto vec = model.GetTensorShape(fNShape); - assert(vec.size() == 1); - size_t n = vec[0]; // size of shape input tensor - - std::vector descShape(n); - std::copy(input_shape, input_shape + n, descShape.begin()); - fShapeOutput = ShapeInference({fShapeInput, descShape})[0]; - // set flag to not write tensor in weight file. Its data will be hard-coded in way model is constructed - model.SetNotWritableInitializedTensor(fNShape); - } else { - throw std::runtime_error("TMVA Reshape Op Shape Tensor " + fNShape + " is not found in model"); - } - } else if (!fAttrAxes.empty()) { - // case fNShape is empty and axes are provided as attributes - std::vector descShape(fAttrAxes.size()); - std::copy(fAttrAxes.begin(), fAttrAxes.end(), descShape.begin()); - fShapeOutput = ShapeInference({fShapeInput, descShape})[0]; - } else if (fOpMode == Flatten || fOpMode == Squeeze) { - fShapeOutput = ShapeInference({fShapeInput})[0]; - } else { - throw std::runtime_error("TMVA Reshape Op : Invalid Input/Attribute data"); - } - // check if output is constant or not - if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) { - fIsOutputConstant = true; - auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); - if (ConvertShapeToLength(fShapeInput) != ConvertShapeToLength(fShapeOutput)) - throw std::runtime_error("TMVA Reshape Op : Invalid Input/Output lengths"); - model.AddConstantTensor(fNOutput, fShapeOutput, inputData); - 
if (model.Verbose()) { - std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " --> " << fNOutput << " (constant) " << ConvertShapeToString(fShapeOutput) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeOutput), inputData) << std::endl; - } - } else { - // non-constant case - model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); - if (model.Verbose()) - std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " --> "<< fNOutput << " " << ConvertShapeToString(fShapeOutput) << std::endl; - } - } - - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; //no op for constant tensors - - OpName = "op_" + OpName; - - // output of reshape is same as input - size_t length = ConvertShapeToLength(fShapeOutput); - if (length != ConvertShapeToLength(fShapeInput)) { - throw std::runtime_error("TMVA SOFIE Reshape Op : wrong output shape - is " + - ConvertShapeToString(fShapeOutput) + " and input is " + - ConvertShapeToString(fShapeInput)); - } - std::stringstream out; - std::string opName = "Reshape"; - if (fOpMode == Flatten) - opName = "Flatten"; - else if (fOpMode == Squeeze) - opName = "Squeeze"; - else if (fOpMode == Unsqueeze) - opName = "Unsquueze"; - - out << SP << "///--------" << opName << " operator\n" << std::endl; - out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << length << ", " << "tensor_" << fNOutput - << ");\n"; - return out.str(); - } -}; - -}//SOFIE - - -#endif //SOFIE_ROPERATOR_RESHAPE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx deleted file mode 100644 index 6951017..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx +++ /dev/null @@ -1,176 +0,0 @@ -#ifndef SOFIE_ROperator_ScatterElements -#define SOFIE_ROperator_ScatterElements - -#include "SOFIE/SOFIE_common.hxx" -#include 
"SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - - -class ROperator_ScatterElements final : public ROperator{ -private: - - int64_t fAxis; - - std::string fNX; - std::string fNI; - std::string fNU; - std::string fNY; - std::string fReduction; - - std::vector fShapeX; - std::vector fShapeI; - std::vector fShapeY; - - // define reduction function. Possibilities are: - // none (default), add, mul, max, min - std::string ReductionFunction(const std::string & t1, const std::string & t2 ) { - std::string name = fReduction; - if (name.empty() || name == "none") - return t2; - else if (name == "add") - return t1 + " + " + t2; - else if (name == "mul") - return t1 + " * " + t2; - else if (name == "max") - return "std::max(" + t1 + "," + t2 + ")"; - else if (name == "min") - return "std::min(" + t1 + "," + t2 + ")"; - else - throw std::runtime_error("TMVA SOFIE ScatterElements : invalid reduction attribute"); - - return std::string(); - } - -public: - ROperator_ScatterElements(){} - ROperator_ScatterElements(const std::string & nameX, const std::string & nameI, const std::string & nameU, const std::string & nameY, - int axis, std::string reduction): - fAxis(axis), - fNX(UTILITY::Clean_name(nameX)), fNI(UTILITY::Clean_name(nameI)), fNU(UTILITY::Clean_name(nameU)), - fNY(UTILITY::Clean_name(nameY)), - fReduction(reduction) - { - fInputTensorNames = { fNX, fNI, fNU }; - fOutputTensorNames = { fNY }; - } - - // type of output given input - std::vector TypeInference(std::vector input) override { - return input; - } - - // shape of output tensors given input tensors - std::vector> ShapeInference(std::vector> input) override { - auto ret = std::vector>(1, input[0]); // return vector size 1 with first input - return ret; - } - - void Initialize(RModel& model) override { - // input must be a graph input, or already initialized intermediate tensor - if (!model.CheckIfTensorAlreadyExist(fNX)){ - throw std::runtime_error(std::string("TMVA SOFIE 
ScatterElements Op Input Tensor ") + fNX + "is not found in model"); - } - if (!model.CheckIfTensorAlreadyExist(fNI)) { - throw std::runtime_error(std::string("TMVA SOFIE ScatterElements Op Input Tensor ") + fNI + "is not found in model"); - } - if (!model.CheckIfTensorAlreadyExist(fNU)) { - throw std::runtime_error(std::string("TMVA SOFIE ScatterElements Op Input Tensor ") + fNU + "is not found in model"); - } - //tbd check for constant tensors - - fShapeX = model.GetTensorShape(fNX); - fShapeI = model.GetTensorShape(fNI); - if (model.GetTensorShape(fNU) != fShapeI) - throw std::runtime_error(std::string("TMVA SOFIE ScatterElements - update tensor has invalid shape ")) ; - if (fShapeX.size() == 0) - throw std::runtime_error(std::string("TMVA SOFIE ScatterElements - input tensor has zero rank ")) ; - if (fShapeX.size() != fShapeI.size()) - throw std::runtime_error(std::string("TMVA SOFIE ScatterElements - index tensor has invalid rank ")) ; - - if (fAxis < 0) fAxis += fShapeX.size(); - - // assume output shape is identical to input shape - fShapeY = fShapeX; - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - } - - std::string GenerateInitCode() override { - std::stringstream out; - return out.str(); - } - - std::string Generate(std::string opName) override { - - if (fIsOutputConstant) return ""; - - if (fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE ScatterElements Op called to Generate without being initialized first"); - } - std::stringstream out; - out << SP << "\n//-------- ScatterElements --- " << opName << "\n"; - - auto strideY = UTILITY::ComputeStrideFromShape(fShapeY); - auto strideI = UTILITY::ComputeStrideFromShape(fShapeI); - - size_t length = ConvertShapeToLength(fShapeY); - - // function to write compute expression for global index from axes indices - auto tensorIndex = [](const std::vector & stride, const std::vector & idx) { - std::stringstream strst; - int dims = idx.size(); - assert (dims == (int) stride.size()); - 
for (int i = 0; i < dims; i++) { - if (stride[i] != 1) - strst << stride[i] << "*" << idx[i]; - else - strst << idx[i]; - if (i < dims-1) - strst << " + "; - } - return strst.str(); - }; - - - // copy first input in output (maybe can be avoided??) - out << SP << "std::copy(tensor_" << fNX << ", tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n"; - - // loop on tensor rank - int dims = fShapeY.size(); - std::vector idx(dims); - for (int i = 0; i < dims; i++) { - idx[i] = std::string("i") + std::to_string(i); - for (int j = 0; j <= i; j++) out << SP; - out << "for (int " << idx[i] << " = 0; " << idx[i] << " < " << fShapeI[i] << "; " << idx[i] << "++) {\n"; - } - // correct index for specific axis - for (int j = 0; j <= dims; j++) out << SP; - out << "int updateIndex = " << tensorIndex(strideI,idx) << ";\n"; - for (int j = 0; j <= dims; j++) out << SP; - out << "int iAxis = tensor_" << fNI << "[updateIndex];\n"; - for (int j = 0; j <= dims; j++) out << SP; - out << "if (iAxis < 0) iAxis += " << fShapeY[fAxis] << ";\n"; - idx[fAxis] = "iAxis"; - for (int j = 0; j <= dims; j++) out << SP; - out << "int outIndex = " << tensorIndex(strideY, idx) << ";\n"; - for (int j = 0; j <= dims; j++) out << SP; - out << "tensor_" << fNY << "[outIndex] = " - << ReductionFunction(std::string("tensor_") + fNY + "[outIndex]", std::string("tensor_") + fNU + "[updateIndex]") << ";\n"; - - for (int i = dims; i > 0; i--) { - for (int j = 0; j < i; j++) out << SP; - out << "}\n"; - } - return out.str(); - } - -}; - -}//SOFIE - - -#endif //SOFIE_ROperator_ScatterElements diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx deleted file mode 100644 index 68edd01..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef SOFIE_ROPERATOR_Sigmoid -#define SOFIE_ROPERATOR_Sigmoid - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - 
-#include - -namespace SOFIE{ - -template -class ROperator_Sigmoid final : public ROperator -{ - -private: - - std::string fNX; - std::string fNY; - std::vector fShape; - -public: - ROperator_Sigmoid(){} - ROperator_Sigmoid(std::string nameX, std::string nameY): - fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; //suggest copy to compiler - return ret; - } - - void Initialize(RModel& model) override { - if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Sigmoid Op Input Tensor is not found in model"); - } - fShape = model.GetTensorShape(fNX); - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); - } - - - std::string Generate(std::string opName) override { - if (fShape.empty()){ - throw std::runtime_error("TMVA SOFIE Operator Sigmoid called to Generate without being initialized first"); - } - std::stringstream out; - int length = 1; - for(auto& i: fShape){ - length *= i; - } - out << "\n//------ Sigmoid -- " << opName << "\n"; - out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = 1 / (1 + std::exp( - tensor_" << fNX << "[id]));\n"; - out << SP << "}\n"; - return out.str(); - } - - std::vector GetStdLibs() override { return { std::string("cmath") };} -}; - -}//SOFIE - -#endif //SOFIE_ROPERATOR_Sigmoid diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Slice.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Slice.hxx deleted file mode 100644 index 6d40003..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Slice.hxx +++ /dev/null @@ -1,263 +0,0 @@ -#ifndef SOFIE_ROPERATOR_SLICE -#define SOFIE_ROPERATOR_SLICE - -#include "SOFIE/SOFIE_common.hxx" 
-#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include -#include -#include - - -namespace SOFIE{ - -// slice operator - -template -class ROperator_Slice final : public ROperator -{ - -private: - - std::string fNData; // input data tensor name - std::string fNOutput; // output data name - std::vector fNames; // tensor names for meta(axis) information - std::vector fShapeInput; // input shape data - std::vector fShapeOutput; // output shape data - // saved Start/End.Steps are corrected from initial ONNX for negative/default values - // and are available for each axis - std::vector fStart; // starting values of slices - std::vector fEnd; // End values of slices - std::vector fSteps; // step values of slices - - std::vector> fAttributes; // attributes for the version <=10 case - - -public: - - ROperator_Slice(){} - - // ctor for versions >= 10 - ROperator_Slice(std::string nameData, std::vector names, std::string nameOutput) - : fNData(UTILITY::Clean_name(nameData)), - fNOutput(UTILITY::Clean_name(nameOutput)) - { - fNames.resize(4); - // axes and steps can be optional - for (size_t i = 0; i < names.size(); ++i) { - fNames[i] = UTILITY::Clean_name(names[i]); - } - - fInputTensorNames = { fNData }; - fOutputTensorNames = { fNOutput }; - } - // ctor for versions < 10 - ROperator_Slice(std::string nameData, std::vector starts, std::vector ends, std::vector axes, std::string nameOutput) - : fNData(UTILITY::Clean_name(nameData)), - fNOutput(UTILITY::Clean_name(nameOutput)) - { - fAttributes.push_back(starts); - fAttributes.push_back(ends); - fAttributes.push_back(axes); - } - - // output type is same as input - std::vector TypeInference(std::vector input) override { - auto ret = std::vector(1, input[0]); - return ret; - } - - // output shape - std::vector> ShapeInference(std::vector> input) override { - auto & input_shape = input[0]; - // assume dimension of output shape is SAME AS INPUT ! 
- std::vector> ret(1, input_shape); - auto & output_shape = ret[0]; - for (size_t i = 0; i < input_shape.size(); i++) { - output_shape[i] = (fEnd[i]-fStart[i])/ fSteps[i]; - } - return ret; - } - - - void Initialize(RModel& model) override { - if (model.CheckIfTensorAlreadyExist(fNData) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA Slice Op Input Tensor is not found in model"); - } - - std::vector> shapes; - fShapeInput = model.GetTensorShape(fNData); - shapes.push_back(fShapeInput); - - std::vector> itensors(4); - if (fNames.size() > 0) { - // loop on the extra 2 or 3 or 4 inputs - for (size_t i = 0; i < fNames.size(); ++i) { - if (!fNames[i].empty()) { - // std::cout << " i " << i << " getting data for tensor " << fNames[i] << std::endl; - auto dptr = model.GetInitializedTensorData(fNames[i]); - auto tensor = static_cast(dptr.get()); - auto vec = model.GetTensorShape(fNames[i]); - assert(vec.size() == 1); - itensors[i] = std::vector(tensor, tensor + vec[0]); - } else { - switch (i) { - case 2: // missing axes - itensors[2] = std::vector(fShapeInput.size()); - std::iota(itensors[2].begin(), itensors[2].end(), 0); - break; - case 3: // missing steps - itensors[3] = std::vector(itensors[0].size(), 1); - default: break; - } - } - } - } else { - assert(fAttributes.size() > 1); - for (size_t i = 0; i < fAttributes.size(); i++) { - itensors[i] = fAttributes[i]; - } - } - size_t dim = fShapeInput.size(); - - fSteps = std::vector(dim, 1); - fStart = std::vector(dim, 0); - fEnd = std::vector(dim, 0); - std::copy(fShapeInput.begin(), fShapeInput.end(), fEnd.begin()); - - auto istart = itensors[0]; - auto iend = itensors[1]; - auto iaxes = itensors[2]; - auto isteps = itensors[3]; - - // make tensor axis - // if iaxes.size is =0 tensor axis is missing and use defaults - if (iaxes.size() > 0) { - for (size_t i = 0; i < iaxes.size(); i++) { - // negative axes - they count from the back - if (iaxes[i] < 0) 
iaxes[i] = dim + iaxes[i]; - if (iaxes[i] < 0 || iaxes[i] >= static_cast(dim)) - throw std::runtime_error("TMVA Slice Op : invalid axis value " + std::to_string(iaxes[i]) + - " for " + std::to_string(i)); - - size_t iAxisDim = fShapeInput[iaxes[i]]; - // find start/end/step for given axis - // check step size for clamping starting/end value - if (istart[i] < 0) istart[i] = iAxisDim + istart[i]; - if (iend[i] < 0) iend[i] = iAxisDim + iend[i]; - if (istart[i] < 0) istart[i] = 0; - if (isteps[i] > 0) { - if (istart[i] > static_cast(iAxisDim)) istart[i] = static_cast(iAxisDim); - if (iend[i] < 0) iend[i] = 0; - if (iend[i] > static_cast(iAxisDim)) iend[i] = static_cast(iAxisDim); - } else if (isteps[i] < 0) { - if (istart[i] > static_cast(iAxisDim)-1) istart[i] = static_cast(iAxisDim) -1; - if (iend[i] < -1) iend[i] = -1; - if (iend[i] > static_cast(iAxisDim)-1) iend[i] = static_cast(iAxisDim) -1; - } else { - throw std::runtime_error("TMVA Slice Op : invalid step value " + std::to_string(isteps[i]) + - " for " + std::to_string(i)); - } - fStart[iaxes[i]] = istart[i]; - fEnd[iaxes[i]] = iend[i]; - fSteps[iaxes[i]] = isteps[i]; - } - } - - fShapeOutput = ShapeInference({fShapeInput})[0]; - // case input is a constant tensor and of int64 type - if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) { - fIsOutputConstant = true; - auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); - size_t outputSize = ConvertShapeToLength(fShapeOutput); - std::vector outputData(outputSize); - std::vector inputStride = UTILITY::ComputeStrideFromShape(fShapeInput); - // perform slice using a recursive function- need to use two lambda functions for this - auto sliceRecursive = [&](size_t iaxis, size_t & outIdx, size_t & inOffset) { - auto slice_impl = [&](size_t iax, size_t & outputIdx, size_t & inputOffset, auto & sliceRecImpl) { - // compute indices - std::vector indices; - for (IType i = fStart[iax]; (fSteps[iax] > 0) ? 
i < fEnd[iax] : i > fEnd[iax]; i += fSteps[iax] ) - indices.push_back(i); - if (iax == dim-1) { // last axis - for (size_t i = 0; i < indices.size(); i++) { - outputData[outputIdx] = inputData[inputOffset + indices[i]]; - outputIdx++; - } - return; - } else { - for (size_t i = 0; i < indices.size(); i++) { - size_t offset = inputOffset + inputStride[iax]*indices[i]; - sliceRecImpl(iax+1, outputIdx, offset,sliceRecImpl); - } - } - }; - slice_impl(iaxis, outIdx, inOffset,slice_impl); - }; - size_t idx = 0; - size_t offset = 0; - sliceRecursive(0, idx, offset); - - model.AddConstantTensor(fNOutput, fShapeOutput, outputData.data()); - if (model.Verbose()) { - std::cout << "Slice: output is a constant tensor " << ConvertShapeToString(fShapeOutput) << " : " - << ConvertValuesToString(outputData) << std::endl; - } - } - else { - model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); - if (model.Verbose()) { - std::cout << "Slice ---> " << fNOutput << " " << ConvertShapeToString(fShapeOutput) << std::endl; - } - } - } - - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; //no op for constant tensors - - OpName = "op_" + OpName; - if (fShapeInput.empty() || fShapeOutput.empty()){ - throw std::runtime_error("TMVA SOFIE Slice Op called to Generate without being initialized first"); - } - - std::stringstream out; - //std::string opName = "Slice"; - - out << SP << "///------- Slice operator\n" << std::endl; - // loop on the dimensions depending no the orders - size_t ndim = fShapeInput.size(); - std::vector strides(ndim,1); - for (int i = int(ndim-2); i >=0 ; i--) { - strides[i] = strides[i+1]*fShapeInput[i+1]; - } - - out << SP << "{\n"; // define operator scope - out << SP << "size_t iOut = 0;\n"; - std::string MSP = SP; - for (size_t idim = 0; idim < ndim; idim++) { - out << MSP << "for (size_t i" << idim << " = " << fStart[idim] << "; i" << idim << " < " << fEnd[idim] - << "; i" << idim << "+= " << 
fSteps[idim] << ") {\n"; - MSP += SP; - if (idim < ndim-1) out << MSP << "size_t stride" << idim << " = " << strides[idim] << "*i" << idim << ";\n"; - } - out << MSP << "size_t iInput = "; - for (size_t idim = 0; idim < ndim-1; idim++) out << " stride" << idim << " + "; - // here should be step size ? - out << "i" << ndim-1 << ";\n"; - out << MSP << "tensor_" << fNOutput << "[iOut++] = tensor_" < - -namespace SOFIE { - -template -class ROperator_Softmax final : public ROperator { - -private: - int64_t fAttrAxis; - - std::string fNX; - std::string fNY; - std::vector fShape; - - std::string fType; - -public: - ROperator_Softmax() {} - ROperator_Softmax(int64_t attr_axis, std::string nameX, std::string nameY) - : fAttrAxis(attr_axis), fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) - { - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { return input; } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; // suggest copy to compiler - return ret; - } - - void Initialize(RModel& model) override { - if (model.CheckIfTensorAlreadyExist(fNX) == - false) { // input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Softmax Op Input Tensor is not found in model"); - } - fShape = model.GetTensorShape(fNX); - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); - fType = ConvertTypeToString(model.GetTensorType(fNX)); - if (model.Verbose()) { - std::cout << "Softmax -> " << fNY << " " << ConvertShapeToString(fShape) << std::endl; - } - } - - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; - if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Operator Softmax called to Generate without being initialized first"); - } - std::stringstream out; - size_t size = fShape.size(); - size_t length = ConvertShapeToLength(fShape); - size_t axis = fAttrAxis < 0 ? 
size + fAttrAxis : fAttrAxis; - out << "\n" << SP << "//------ SOFTMAX - " << size << " " << length << " " << axis << "\n"; - // use safe numerically implementation by subtracting max of tensor - if (size == 1) { - out << SP << fType << " vmax = tensor_" << fNX << "[0];\n"; - out << SP << "for (size_t i = 1; i < " << length << " ; i++){\n"; - out << SP << SP << "if (tensor_" << fNX << "[i] > vmax) vmax = tensor_" << fNX << "[i];\n"; - out << SP << "}\n"; - out << SP << fType << " sum = 0.0;\n"; - out << SP << "for (size_t i = 0; i < " << length << " ; i++){\n"; - out << SP << SP << "tensor_" << fNY << "[i] = std::exp(tensor_" << fNX << "[i] - vmax);\n"; - out << SP << SP << "sum += tensor_" << fNY << "[i];\n"; - out << SP << "}\n"; - out << SP << "for (size_t i = 0; i < " << length << " ; i++){\n"; - out << SP << SP << "tensor_" << fNY << "[i] /= sum;\n"; - out << SP << "}\n"; - } else { - size_t batch = fShape[0]; - size_t channel = fShape[1]; - size_t width = (size > 2) ? fShape[size - 1] : 1; - size_t height = (size > 3) ? fShape[size - 2] : 1; - size_t depth = (size > 4) ? 
fShape[size - 3] : 1; - size_t hStride = width; - size_t dStride = height * width; - size_t cStride = depth * dStride; - size_t bStride = channel * cStride; - - size_t N = 0; // Size of the axis - size_t iStride = 0; - if (axis == 0) { - N = batch; - iStride = bStride; - } else if (axis == 1) { - N = channel; - iStride = cStride; - } else if (axis == size - 1) { - N = width; - iStride = 1; - } else if (size > 3 && axis == size - 2) { - N = height; - iStride = hStride; - } else if (size == 5 && axis == size - 3) { - N = depth; - iStride = dStride; - } else { - throw - std::runtime_error("TMVA::SOFIE - Softmax operator along the axis " - + std::to_string(fAttrAxis) + " with " + std::to_string(size) - + "d input tensor not supported."); - } - - bool notBatch = axis != 0; - bool notChannel = axis != 1; - bool notDepth = (size == 5 && axis != 2); - bool notHeight = (size == 5 && axis != 3) || (size == 4 && axis != 2); - bool notWidth = (size == 5 && axis != 4) || (size == 4 && axis != 3) || (size == 3 && axis != 2); - - if (notBatch) { - out << SP << "for (size_t n = 0; n < " << batch << " ; n++){\n"; - } - if (notChannel) { - out << SP << SP << "for (size_t c = 0; c < " << channel << " ; c++){\n"; - } - if (notDepth) { - out << SP << SP << "for (size_t d = 0; d < " << depth << " ; d++){\n"; - } - if (notHeight) { - out << SP << SP << "for (size_t h = 0; h < " << height << " ; h++){\n"; - } - if (notWidth) { - out << SP << SP << "for (size_t w = 0; w < " << width << " ; w++){\n"; - } - out << SP << SP << SP << fType << " sum = 0.;\n"; - out << SP << SP << SP << "size_t index = 0"; - if (notBatch) { - out << " + n * " << bStride; - } - if (notChannel) { - out << "+ c * " << cStride; - } - if (notDepth) { - out << " + d * " << dStride; - } - if (notHeight) { - out << " + h * " << hStride; - } - if (notWidth) { - out << " + w"; - } - out << ";\n"; - // apply softmax along the axis - find first maximum value for numerical stability - if (N == 0) - throw 
std::runtime_error("TMVA::SOFIE - Softmax operator is along axis with zero elements"); - out << SP << SP << SP << fType << " vmax = tensor_" << fNX << "[index];\n"; - out << SP << SP << SP << "for (size_t i = 1; i < " << N << "; i++) {\n"; - out << SP << SP << SP << SP << "if (tensor_" << fNX << "[index + i*" << iStride << "] > vmax)\n"; - out << SP << SP << SP << SP << SP << "vmax = tensor_" << fNX << "[index + i*" << iStride << "];\n"; - out << SP << SP << SP << "}\n"; - out << SP << SP << SP << "for (size_t i = 0; i < " << N << "; i++) {\n"; - out << SP << SP << SP << SP << "tensor_" << fNY << "[index + i*" << iStride << "] = std::exp(tensor_" << fNX - << "[index + i*" << iStride << "] - vmax);\n"; - out << SP << SP << SP << SP << "sum += tensor_" << fNY << "[index + i*" << iStride << "];\n"; - out << SP << SP << SP << "}\n"; - out << SP << SP << SP << "for (size_t i = 0; i < " << N << "; i++) {\n"; - out << SP << SP << SP << SP << "tensor_" << fNY << "[index + i*" << iStride << "] /= sum;\n"; - out << SP << SP << SP << "}\n"; - if (notWidth) { - out << SP << SP << "}\n"; // end w - } - if (notHeight) { - out << SP << SP << "}\n"; // end h - } - if (notDepth) { - out << SP << SP << "}\n"; // end d - } - if (notChannel) { - out << SP << SP << "}\n"; // end c - } - if (notBatch) { - out << SP << "}\n"; // end n - } - } - return out.str(); - } -}; - -} // namespace SOFIE - -#endif // SOFIE_ROPERATOR_Softmax diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Tanh.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Tanh.hxx deleted file mode 100644 index 37c92ee..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Tanh.hxx +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef SOFIE_ROPERATOR_Tanh -#define SOFIE_ROPERATOR_Tanh - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - -template -class ROperator_Tanh final : public ROperator -{ - -private: - - std::string fNX; - std::string fNY; - std::vector fShape; - 
-public: - ROperator_Tanh(){} - ROperator_Tanh(std::string nameX, std::string nameY): - fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; //suggest copy to compiler - return ret; - } - - void Initialize(RModel& model) override { - //input must be a graph input, or already initialized intermediate tensor - if (model.CheckIfTensorAlreadyExist(fNX) == false){ - throw std::runtime_error("TMVA SOFIE Tanh Op Input Tensor is not found in model"); - } - fShape = model.GetTensorShape(fNX); - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); - - } - - - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; - if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Tanh operator called to Generate without being initialized first"); - } - std::stringstream out; - size_t length = ConvertShapeToLength(fShape); - out << "\n//------ TANH\n"; - out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = std::tanh(tensor_" << fNX << "[id]);\n"; - out << SP << "}\n"; - return out.str(); - } - - std::vector GetStdLibs() override { return { std::string("cmath") };} -}; - -}//SOFIE - - -#endif //SOFIE_ROPERATOR_Tanh diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx deleted file mode 100644 index 354fbe3..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx +++ /dev/null @@ -1,149 +0,0 @@ -#ifndef SOFIE_ROPERATOR_Tile -#define SOFIE_ROPERATOR_Tile - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - -template -class ROperator_Tile final : public ROperator -{ - -private: - - std::string fNRepeats; - std::string 
fNInput; - std::string fNY; - std::vectorfShapeInput; - std::vector fShapeY; - -public: - ROperator_Tile(){} - ROperator_Tile(std::string nameRepeat, std::string nameInput, std::string nameY): - fNRepeats(UTILITY::Clean_name(nameRepeat)),fNInput(UTILITY::Clean_name(nameInput)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNRepeats, fNInput }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - std::vector ret = input[0]; - - for(size_t i=0; i < input[1].size(); i++) { - ret[i]=ret[i]*input[1][i]; - } - return {ret}; - } - - void Initialize(RModel& model) override { - //input must be a graph input, or already initialized intermediate tensor - if (model.CheckIfTensorAlreadyExist(fNInput) == false){ - throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model"); - } - if (model.CheckIfTensorAlreadyExist(fNRepeats) == false){ - throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model"); - } - fShapeInput=model.GetTensorShape(fNInput); - - // if repeats vector is not initialized we cannot deduce shape of output - // not support for time being this case - if (!model.IsInitializedTensor(fNRepeats)) { - throw std::runtime_error("TMVA SOFIE Tile Op: non-initialized repeats input is not supported"); - } - - // Retrieve the data pointer for the repeats tensor - auto repptr = model.GetInitializedTensorData(fNRepeats); - // Cast the raw pointer to the appropriate type (size_t*) - auto repeats_data = static_cast(repptr.get()); - if (repeats_data == nullptr) { - throw std::runtime_error("Failed to retrieve the data for the repeats tensor."); - } - // Get the shape of the repeats tensor to determine the number of elements - auto repeats_shape = model.GetTensorShape(fNRepeats); - // Ensure the repeats tensor is 1D and get the number of elements - if (repeats_shape.size() != 1) { - throw 
std::runtime_error("Repeats tensor is not 1D."); - } - size_t num_elements = repeats_shape[0]; - // Convert the data to a vector of size_t - std::vector repeats_vector(num_elements); - std::copy(repeats_data, repeats_data + num_elements, repeats_vector.begin()); - - - fShapeY = ShapeInference({fShapeInput,repeats_vector})[0]; - - model.AddIntermediateTensor(fNY, model.GetTensorType(fNInput), fShapeY); - - if (model.Verbose()) - std::cout << "Tile: " << fNInput << " " << ConvertShapeToString(fShapeInput) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) - << " given repeats " << ConvertShapeToString(repeats_vector) << std::endl; - } - - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; - if (fShapeInput.empty() || fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Tile Op called to Generate without being initialized first"); - } - - //size_t input_length = ConvertShapeToLength(fShapeInput); - //size_t output_length = ConvertShapeToLength(fShapeY); - - - std::stringstream out; - std::string input = "tensor_" + fNInput; - std::string output = "tensor_" + fNY; - out << "///-------- Tile operator\n"; - out << "{\n"; // add scope to re-use same names - out << "const int input_shape[" << fShapeInput.size() << "] = " << ConvertShapeToString(fShapeInput) << ";\n"; - - out << "int inputLength = " << ConvertShapeToLength(fShapeInput) << ";\n"; - out << "int s = 1;\n"; - // loop from inverse dim order - out << "for (int i = " << fShapeInput.size()-1 << "; i >=0; i--) {\n"; - out << SP << "int r = tensor_" << fNRepeats << "[i];\n"; - // we cannot exclude case where repeats=1 since we need offset - //out << SP << "if (r == 1 && i < " << fShapeInput.size()-1 << ") continue;\n"; - out << SP << "int i_offset = 0, o_offset = 0;\n"; - out << SP << "s = s * input_shape[i];\n"; - // case we have first copy - out << SP << "if (i == " << fShapeInput.size()-1 << ") {\n"; - out << SP << SP << "for (int j = 0; j < inputLength/s ; j++) 
{\n"; - out << SP << SP << SP << "for (int k = 0; k < r ; k++) {\n"; - out << SP << SP << SP << SP << "std::copy(" << input << "+ i_offset, " - << input << "+ i_offset + s, " << output << "+ o_offset);\n"; - out << SP << SP << SP << SP << "o_offset += s;\n"; - out << SP << SP << SP << "}\n"; // end k loop - out << SP << SP << SP << "i_offset += s;\n"; - out << SP << SP << "}\n"; // end j loop - out << SP << "} else {\n"; // second copy we do from output to output - // and we need to loop on j from reverse order to avoir re-writing in output tensor - out << SP << SP << "for (int j = inputLength/s - 1 ; j>=0; j--) {\n"; - out << SP << SP << SP << "o_offset = j*s*r;\n"; - out << SP << SP << SP << "i_offset = j*s;\n"; - out << SP << SP << SP << "for (int k = 0; k < r ; k++) {\n"; - out << SP << SP << SP << SP << "std::copy(" << output << "+ i_offset, " - << output << "+ i_offset + s, " << output << "+ o_offset);\n"; - out << SP << SP << SP << SP << "o_offset += s;\n"; - out << SP << SP << SP << "}\n"; // end k loop - out << SP << SP << "}\n"; // end j loop - out << SP << "}\n"; // end if - out << SP << "s *= r;\n"; - out << SP << "inputLength *= r;\n"; - out << "}\n"; // end i loop - out << "}\n"; // end of scope - return out.str(); - } -}; - -}//SOFIE - - -#endif //SOFIE_ROPERATOR_Tile diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx deleted file mode 100644 index 28ac093..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx +++ /dev/null @@ -1,243 +0,0 @@ -#ifndef SOFIE_ROperator_Where -#define SOFIE_ROperator_Where - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - - - -template -class ROperator_Where final : public ROperator{ -private: - - bool fIsInputBoolTensor = false; - - - std::string fNA; - std::string fNB; - std::string fNC; - std::string fNBroadcastedA; - std::string fNBroadcastedB; - std::string fNBroadcastedC; - 
std::string fNY; - - - std::vector fShapeA; - std::vector fShapeB; - std::vector fShapeC; - std::vector fShapeY; - - -public: - ROperator_Where(){} - ROperator_Where(const std::string & nameA, const std::string & nameB, const std::string & nameC, const std::string & nameY): - fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNC(UTILITY::Clean_name(nameC)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNA, fNB, fNC }; - fOutputTensorNames = { fNY }; - } - - // type of output given input - std::vector TypeInference(std::vector input) override { - return input; - } - - // shape of output tensors given input tensors - std::vector> ShapeInference(std::vector> input) override { - // assume now inputs have same shape (no broadcasting) - auto ret = std::vector>(1, input[0]); // return vector size 1 with first input - return ret; - } - - void Initialize(RModel& model) override { - // input must be a graph input, or already initialized intermediate tensor - if (!model.CheckIfTensorAlreadyExist(fNA)){ - throw std::runtime_error(std::string("TMVA SOFIE Where Op Input Tensor ") + fNA + "is not found in model"); - } - if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw std::runtime_error(std::string("TMVA SOFIE Where Op Input Tensor ") + fNB + "is not found in model"); - } - if (!model.CheckIfTensorAlreadyExist(fNC)) { - throw std::runtime_error(std::string("TMVA SOFIE Where Op Input Tensor ") + fNC + "is not found in model"); - } - // check if fNC input tensor is boolean - if (model.IsReadyInputTensor(fNC)) - fIsInputBoolTensor = true; - // check broadcast for A, B and C - fShapeA = model.GetTensorShape(fNA); - fShapeB = model.GetTensorShape(fNB); - fShapeC = model.GetTensorShape(fNC); - bool broadcast = !UTILITY::AreSameShape(fShapeA, fShapeB) || !UTILITY::AreSameShape(fShapeA, fShapeC); - if (broadcast) { - // find shape to broadcast between A,B,C looking for max length - size_t lengthA = ConvertShapeToLength(fShapeA); - size_t lengthB = 
ConvertShapeToLength(fShapeB); - size_t lengthC = ConvertShapeToLength(fShapeC); - bool broadcastA = false, broadcastB = false, broadcastC = false; - if (lengthA >= lengthB && lengthA >= lengthC) { - fShapeY = fShapeA; - //broadcast B and C if different than A - broadcastB = (lengthB != lengthA); - broadcastC = (lengthC != lengthA); - } - else if (lengthB >= lengthA && lengthB >= lengthC) { - fShapeY = fShapeB; - //broadcast A and C if different than B - broadcastA = (lengthA != lengthB); - broadcastC = (lengthC != lengthB); - } - else if (lengthC >= lengthA && lengthC >= lengthB) { - fShapeY = fShapeC; - //broadcast A and B if different than C - broadcastA = (lengthA != lengthC); - broadcastB = (lengthB != lengthC); - } - - // Broadcast A to Y - if (broadcastA) { - fNBroadcastedA = "BC_" + fNA + "_to_" + fNY; - if (model.IsInitializedTensor(fNA)) { - auto data = model.GetInitializedTensorData(fNA); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), - std::default_delete()); - // Update the data and the shape of A - model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); - fShapeA = fShapeY; - } else { - // Add an intermediate tensor for broadcasting A - model.AddIntermediateTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY); - } - } - // Broadcast B to Y - if (broadcastB) { - fNBroadcastedB = "BC_" + fNB + "_to_" + fNY; - if (model.IsInitializedTensor(fNB)) { - auto data = model.GetInitializedTensorData(fNB); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), - std::default_delete()); - // do not update tensor B but add broadcasted one (since it can be input to some other operators) - model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); - fShapeB = fShapeY; - } else { - // Add an intermediate tensor for broadcasting B - 
model.AddIntermediateTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY); - } - } - // Broadcast C to Y - if (broadcastC) { - fNBroadcastedC = "BC_" + fNC + "_to_" + fNY; - if (model.IsInitializedTensor(fNC)) { - auto data = model.GetInitializedTensorData(fNC); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeC, fShapeY), - std::default_delete()); - // do not update tensor C but add broadcasted one (since it can be input to some other operators) - model.AddConstantTensor(fNBroadcastedC, model.GetTensorType(fNC), fShapeY, broadcastedData); - fShapeC = fShapeY; - } else { - // Add an intermediate tensor for broadcasting B - model.AddIntermediateTensor(fNBroadcastedC, model.GetTensorType(fNC), fShapeY); - } - } - } else { - fShapeY = fShapeA; - } - // check case of constant output (if all inputs are defined) - if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB) && model.IsInitializedTensor(fNC)) { - std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC; - auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); - auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); - auto dataC = static_cast(model.GetInitializedTensorData(nameC).get()); - std::vector dataY(ConvertShapeToLength(fShapeY)); - for (size_t i = 0; i < dataY.size(); i++) - dataY[i] = (dataC[i]) ? 
dataA[i] : dataB[i]; - model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in a file - model.SetNotWritableInitializedTensor(nameA); - model.SetNotWritableInitializedTensor(nameB); - model.SetNotWritableInitializedTensor(nameC); - - fIsOutputConstant = true; - if (model.Verbose()) - std::cout << "Where op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(dataY) << std::endl; - - // output is a constant tensor - fOutputTensorNames.pop_back(); - } - else { - model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); - } - } - - std::string GenerateInitCode() override { - std::stringstream out; - return out.str(); - } - - std::string Generate(std::string OpName) override { - - if (fIsOutputConstant) return ""; - - OpName = "op_" + OpName; - - if (fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Where Op called to Generate without being initialized first"); - } - std::stringstream out; - out << SP << "\n//-------- Where \n"; - size_t length = ConvertShapeToLength(fShapeY); - std::string typeName = TensorType::Name(); - // Broadcast A if it's uninitialized - if (fShapeA != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNA << "\n"; - //out << SP << "{\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedA << ");\n"; - } - // Broadcast B if it's uninitialized - if (fShapeB != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNB << "\n"; - //out << SP << "{\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedB << ");\n"; - } - // Broadcast C if it's uninitialized - if (fShapeC != fShapeY) { - // special case if C is an 
input tensor - if (fIsInputBoolTensor) { - size_t inputLength = ConvertShapeToLength(fShapeC); - out << SP << "std::vector fTensor_" << fNC << "(tensor_" << fNC << ", tensor_" << fNC << " + " << inputLength << ");\n"; - } - out << SP << "// Broadcasting uninitialized tensor " << fNC << "\n"; - //out << SP << "{\n"; - // for boolean we need to pass vector and use the non-template version of the function - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast(fTensor_" << fNC << ", " << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedC << ");\n"; - } - std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC; - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - // get output tensor applying condition (note we need to use directly the vector since v.data(), i.e the data pointer, does not exist) - out << SP << SP << "tensor_" << fNY << "[id] = " << "(fTensor_" << nameC << "[id]) ? 
tensor_" - << nameA << "[id] : tensor_" + nameB + "[id];\n"; - out << SP << "}\n"; - return out.str(); - } - -}; - -}//SOFIE - - -#endif //SOFIE_ROperator_Where diff --git a/src/SOFIE_core/src/RModel.cxx b/src/SOFIE_core/src/RModel.cxx deleted file mode 100644 index e5495ed..0000000 --- a/src/SOFIE_core/src/RModel.cxx +++ /dev/null @@ -1,1327 +0,0 @@ -#include -#include -#include -#include -#include - -#include "TFile.h" - -#include "SOFIE/RModel.hxx" -#include "SOFIE/SOFIE_common.hxx" - - -namespace SOFIE { - -std::underlying_type_t operator|(Options opA, Options opB) { - return static_cast>(opA) | static_cast>(opB); -} -std::underlying_type_t operator|(std::underlying_type_t opA, Options opB) { - return opA | static_cast>(opB); -} - -RModel::RModel(RModel&& other) { - fInputTensorInfos = std::move(other.fInputTensorInfos); - fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos); - fOutputTensorNames = other.fOutputTensorNames; - fInputTensorNames = other.fInputTensorNames; - fOperators = std::move(other.fOperators); - fInitializedTensors = std::move(other.fInitializedTensors); - fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos); - fName = other.fName; - fFileName = other.fFileName; - fParseTime = other.fParseTime; - fGC = other.fGC; - fNeededBlasRoutines = other.fNeededBlasRoutines; - fNeededStdLib = other.fNeededStdLib; -} - -RModel& RModel::operator=(RModel&& other) { - fInputTensorInfos = std::move(other.fInputTensorInfos); - fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos); - fOutputTensorNames = other.fOutputTensorNames; - fInputTensorNames = other.fInputTensorNames; - fOperators = std::move(other.fOperators); - fInitializedTensors = std::move(other.fInitializedTensors); - fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos); - fName = other.fName; - fFileName = other.fFileName; - fParseTime = other.fParseTime; - fGC = other.fGC; - fNeededBlasRoutines = other.fNeededBlasRoutines; - fNeededStdLib = 
other.fNeededStdLib; - return *this; -} - -const std::vector& RModel::GetTensorShape(std::string name) const { - auto f = fReadyInputTensorInfos.find(name); - if (f != fReadyInputTensorInfos.end()) { - return f->second.shape; - } - auto f2 = fInitializedTensors.find(name); - if (f2 != fInitializedTensors.end()) { - return f2->second.shape(); - } - auto f3 = fInputTensorInfos.find(name); - if (f3 != fInputTensorInfos.end()) { - throw std::runtime_error("TMVA SOFIE tensor [" + name + "] is an input tensor with unspecified dimension parameter"); - } - auto f4 = fIntermediateTensorInfos.find(name); - if (f4 != fIntermediateTensorInfos.end()) { - return f4->second.shape; - } - if (fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) - throw std::runtime_error("TMVA SOFIE tensor [" + name + "] is a dynamic tensor. Use GetDynamicTensorShape instead of GetTensorShape"); - - if (fIsSubGraph && fParentGraph) - return fParentGraph->GetTensorShape(name); - - throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not found"); -} - -std::vector RModel::GetDynamicTensorShape(std::string name) const { - if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { - return f->second.shape; - } - if (auto f = fInputTensorInfos.find(name); f != fInputTensorInfos.end()) { - return f->second.shape; - } - // in case is not a dynamic tensor convert normal shape to Dim one - // for this we need to return the vector by value - return ConvertShapeToDim(GetTensorShape(name)); -} - -const ETensorType& RModel::GetTensorType(std::string name) const { - auto f = fReadyInputTensorInfos.find(name); - if (f != fReadyInputTensorInfos.end()) { - return f->second.type; - } - auto f2 = fInitializedTensors.find(name); - if (f2 != fInitializedTensors.end()) { - return f2->second.type(); - } - auto f3 = fInputTensorInfos.find(name); - if (f3 != fInputTensorInfos.end()) { - return f3->second.type; - } - auto f4 = 
fIntermediateTensorInfos.find(name); - if (f4 != fIntermediateTensorInfos.end()) { - return f4->second.type; - } - auto f5 = fDynamicTensorInfos.find(name); - if (f5 != fDynamicTensorInfos.end()){ - return f5->second.type; - } - - if (fIsSubGraph && fParentGraph) - return fParentGraph->GetTensorType(name); - - throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the type is requested is not found, model name: " + fName); -} - -bool RModel::CheckIfTensorAlreadyExist(std::string tensor_name) { - if (fReadyInputTensorInfos.find(tensor_name) != fReadyInputTensorInfos.end()) return true; - if (fInputTensorInfos.find(tensor_name) != fInputTensorInfos.end()) return true; - if (fInitializedTensors.find(tensor_name) != fInitializedTensors.end()) return true; - if (fIntermediateTensorInfos.find(tensor_name) != fIntermediateTensorInfos.end()) return true; - if (fDynamicTensorInfos.find(tensor_name) != fDynamicTensorInfos.end()) return true; - if (fIsSubGraph && fParentGraph) return fParentGraph->CheckIfTensorAlreadyExist(tensor_name); - return false; -} - -void RModel::AddInputTensorInfo(std::string input_name, ETensorType type, std::vector shape) { - input_name = UTILITY::Clean_name(input_name); - if (CheckIfTensorAlreadyExist(input_name)) { - throw std::runtime_error("TMVA-SOFIE: input tensor with name " + input_name + " already exists \n"); - } - - InputTensorInfo inputInfo { type, shape }; - fInputTensorInfos[input_name] = inputInfo; -} - -void RModel::AddInputTensorInfo(std::string input_name, ETensorType type, std::vector shape) { - input_name = UTILITY::Clean_name(input_name); - if (CheckIfTensorAlreadyExist(input_name)) { - throw std::runtime_error("TMVA-SOFIE: input tensor with name " + input_name + " already exists \n"); - } - TensorInfo inputInfo { type, shape }; - fReadyInputTensorInfos[input_name] = inputInfo; -} - -void RModel::AddInputTensorName(std::string input_name) { - fInputTensorNames.emplace_back(UTILITY::Clean_name(input_name)); -} - 
-void RModel::AddOperator(std::unique_ptr op, int order_execution) { - AddBlasRoutines(op->GetBlasRoutines()); - auto libs = op->GetStdLibs(); - auto op_input_tensors = op->GetOpInputTensors(); - for (auto& stdlib : libs) { - AddNeededStdLib(stdlib); - } - if (order_execution >= 0) { - fOperators.insert(fOperators.begin() + order_execution, std::move(op)); - } else { - fOperators.push_back(std::move(op)); - } - - // storing the last usage of tensors which are input to - // operators (but are not inputs to the model, i.e. they are intermediate - // tensors). This information is needed to keep a check on when a - // particular intermediate tensor can be flushed to free up memory for reuse. - for(size_t index = 0; index shape, std::shared_ptr data) { - tensor_name = UTILITY::Clean_name(tensor_name); - //NB: own data - if (CheckIfTensorAlreadyExist(tensor_name)) { - throw std::runtime_error("TMVA-SOFIE: initialized tensor with name " + tensor_name + " already exists \n"); - } - InitializedTensor new_tensor {type, shape, data}; - fInitializedTensors[tensor_name] = new_tensor; -} - -void RModel::AddConstantTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data) { - tensor_name = UTILITY::Clean_name(tensor_name); - //NB: own data - if (CheckIfTensorAlreadyExist(tensor_name)) { - throw std::runtime_error("TMVA-SOFIE: initialized tensor with name " + tensor_name + " already exists \n"); - } - InitializedTensor new_tensor {type, shape, data, true}; // add here flag to specify is a constant tensor - fInitializedTensors[tensor_name] = new_tensor; -} - -bool RModel::IsInitializedTensor(const std::string& tensorName) const { - std::string name = UTILITY::Clean_name(tensorName); - return fInitializedTensors.find(name) != fInitializedTensors.end(); -} -bool RModel::IsConstantTensor(const std::string& tensorName) const { - std::string name = UTILITY::Clean_name(tensorName); - auto itr = fInitializedTensors.find(name); - if (itr == 
fInitializedTensors.end()) return false; - return itr->second.IsConstantTensor(); -} - -bool RModel::IsDynamicTensor(const std::string& tensorName) const { - std::string name = UTILITY::Clean_name(tensorName); - return fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end(); -} -bool RModel::IsDimInputTensor(const std::string& tensorName) const { - std::string name = UTILITY::Clean_name(tensorName); - return fInputTensorInfos.find(name) != fInputTensorInfos.end(); -} -bool RModel::IsReadyInputTensor(const std::string& tensorName) const { - std::string name = UTILITY::Clean_name(tensorName); - return fReadyInputTensorInfos.find(name) != fReadyInputTensorInfos.end(); -} - -// generic addition of a tensor -void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector dim_shape) { - auto int_shape = ConvertShapeToInt(dim_shape); - if (!int_shape.empty()) - AddIntermediateTensor(tensor_name, type, int_shape); - else - AddDynamicTensor(tensor_name, type, dim_shape); -} - -void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector shape) { - tensor_name = UTILITY::Clean_name(tensor_name); - if (CheckIfTensorAlreadyExist(tensor_name)) { - throw std::runtime_error("TMVA-SOFIE: intermediate tensor with name " + tensor_name + " already exists \n"); - } - TensorInfo new_tensor {type, shape}; - fIntermediateTensorInfos[tensor_name] = new_tensor; -} - -void RModel::AddDynamicTensor(std::string tensor_name, ETensorType type, std::vector shape){ - tensor_name = UTILITY::Clean_name(tensor_name); - if (CheckIfTensorAlreadyExist(tensor_name)){ - throw std::runtime_error("TMVA-SOFIE: intermediate tensor with name " + tensor_name + " already exists \n"); - } - DynamicTensorInfo new_tensor {type, shape}; - fDynamicTensorInfos[tensor_name] = new_tensor; - // store shape parameter if not existing - for (auto &d : shape) { - if (d.isParam) { - if (fShapeParams.count(d.param) == 0) { - // case parameter is an expression of some 
other existing parameter, no need to - // register it - if (d.dim != size_t(-1)) { - fShapeParams[d.param] = std::to_string(d.dim); - } - } - } - } -} - -void RModel::AddOutputTensorNameList(std::vector outputtensornames) { - fOutputTensorNames.clear(); - for(auto& it : outputtensornames) { - fOutputTensorNames.emplace_back(UTILITY::Clean_name(it)); - } -} - -void RModel::UpdateOutputTensorList(std::vector curr_output_tensors, std::vector new_output_tensors) { - for(auto& it:curr_output_tensors) { - fOutputTensorNames.erase(std::remove(fOutputTensorNames.begin(), fOutputTensorNames.end(), it), fOutputTensorNames.end()); - } - fOutputTensorNames.insert(fOutputTensorNames.end(), new_output_tensors.begin(), new_output_tensors.end()); -} - -void RModel::UpdateInitializedTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data) { - tensor_name = UTILITY::Clean_name(tensor_name); - if (!CheckIfTensorAlreadyExist(tensor_name)) { - throw std::runtime_error("TMVA-SOFIE: tensor " + tensor_name + " not found when trying to update it"); - } - InitializedTensor new_tensor {type, shape, data}; - fInitializedTensors[tensor_name] = new_tensor; -} - -std::shared_ptr RModel::GetInitializedTensorData(std::string tensor_name) { - auto f = fInitializedTensors.find(tensor_name); - if (f == fInitializedTensors.end()) { - throw std::runtime_error("TMVA-SOFIE: tensor " + tensor_name + " not found when trying to get its data"); - } else { - return f->second.sharedptr(); - } -} - -void RModel::SetNotWritableInitializedTensor(const std::string & tensor_name) { - auto t = fInitializedTensors.find(tensor_name); - if (t == fInitializedTensors.end()) { - throw std::runtime_error("TMVA-SOFIE: initialized tensor " + tensor_name + " not found when trying to get its info"); - } - t->second.SetNotWritable(); - } - -std::string RModel:: AllocateIntermediateMemory(std::span op_output_tensors) { - - std::string memory_allocation_string = ""; - bool allocated; - - for 
(auto& it : op_output_tensors) { - allocated = false; - if (GetTensorType(std::string(it)) == ETensorType::BOOL || - fInitializedTensors.find(std::string(it)) != fInitializedTensors.end() || - fDynamicTensorInfos.find(std::string(it)) != fDynamicTensorInfos.end()) continue; - - auto tensor_size = GetTypeSize(GetTensorType(std::string(it))) * ConvertShapeToLength(GetTensorShape(std::string(it))); - memory_allocation_string += "\n // Allocating memory for intermediate tensor " + std::string(it) + " with size " + std::to_string(tensor_size) + " bytes"; - - for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); chunk != fIntermediateMemoryInfo.available_stack.end(); ) { - - // check if available memory chunks can accommodate the tensor - if (chunk->second >= tensor_size) { - auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it, tensor_size); - auto new_chunk_location = chunk->first+chunk->second-tensor_size; - fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk; - - memory_allocation_string += "\n" + ConvertTypeToString(GetTensorType(std::string(it))) + - "* tensor_" + std::string(it) + - " = reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(new_chunk_location) + ");\n"; - chunk->second -= tensor_size; - - allocated = true; - - if (chunk->second == 0) { - chunk = fIntermediateMemoryInfo.available_stack.erase(chunk); - } - - break; - } - ++chunk; - } - - if (!allocated) { - size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty() - ? 
0 - : fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size; - - fIntermediateMemoryInfo.total_stack[chunk_idx] = - { - it, - tensor_size - }; - - memory_allocation_string += "\n"+ConvertTypeToString(GetTensorType(std::string(it)))+"* tensor_"+ std::string(it) + "= reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(chunk_idx) + ");\n"; - } - } - return memory_allocation_string; -} - -void RModel::CheckAndFlushIntermediateMemory(std::span op_input_tensors, const size_t& op_idx){ - for (auto &it : op_input_tensors){ - // last occurence of the tensor is reached => flush it from memory - if (fIntermediateTensorFrequencyLookup[it] == op_idx) { - for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); - chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk ) { - if (chunk->second.tensor_name == it) { - - // check if nearby chunks in available memory can coalesce - auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound(chunk->first); // smallest element greater than the flushed chunk idx - auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) ? 
fIntermediateMemoryInfo.available_stack.end() : std::prev(first_greater); // largest element smaller than the flushed chunk idx - - // check if the next stack entry is actually adjacent in memory - if (last_smaller->first+last_smaller->second + 1 == chunk->first){ - last_smaller->second += chunk->second.tensor_size; - fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second); - - if (last_smaller->first + last_smaller->second + 1 == first_greater->first){ - fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]); - first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater); - } - } else{ - if (chunk->first + chunk->second.tensor_size + 1 == first_greater->first){ - fIntermediateMemoryInfo.total_stack[chunk->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]); - first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater); - } - fIntermediateMemoryInfo.available_stack.insert({ - chunk->first, - chunk->second.tensor_size - }); - } - } - } - } - } -} - - - -void RModel::Initialize(int batchSize, bool verbose) { - std::map inputParams; - if (batchSize > 0) { - inputParams["input_size"] = batchSize; - inputParams["batch_size"] = batchSize; - inputParams["bs"] = batchSize; - } - Initialize(inputParams, verbose); - fIntermediateMemoryInfo = MemoryPoolInfo(); -} -void RModel::Initialize(const std::map & inputParams, bool verbose) { - - fVerbose = int(verbose); - - if (fIsInitialized) { - if (verbose) - std::cout << "Model is already initialized - skip initialization " << std::endl; - return; - } - fIntermediateTensorInfos.clear(); - fDynamicTensorInfos.clear(); - - // loop on inputs and see if shape can be full specified - // if the batch size is provided it can be used to specify the full shape - // Add the full specified tensors in fReadyInputTensors collection - auto originalInputTensorInfos = fInputTensorInfos; // need to copy 
because we may delete elements - for (auto &input : originalInputTensorInfos) { - if (verbose) std::cout << "looking at the tensor " << input.first << std::endl; - // if a parameter (e.g. batch_size) is specified use for converting parametric shape in defined one - if (!inputParams.empty()) { - for (auto &d : input.second.shape) { - if (d.isParam) { - std::string pname = d.param; - if (pname == input.first + "_size") pname = "input_size"; - auto itr = inputParams.find(pname); - if (itr != inputParams.end() ) { - d = Dim{ itr->second }; - if (verbose) - std::cout << "Tensor: " << input.first << " - fix parametric shape " << itr->first << " to " << itr->second << std::endl; - } - } - } - } - // see if shape now is fully defined - auto shape = ConvertShapeToInt(input.second.shape); - if (verbose) - std::cout << "converting input shape for " << input.first << " " << ConvertShapeToString(shape) << " from " - << ConvertDynamicShapeToString(input.second.shape) << std::endl; - if (!shape.empty()) { - // case shape is defined (not parametric) we add the tensor in the fReadyInputTensorInfos map and - // we remove the tensor from the fInputTensorInfo where th eold parametric shape was stored - fInputTensorInfos.erase(input.first); - // add to the ready input tensor information the new fixed shape - AddInputTensorInfo(input.first, input.second.type, shape); - // check consistency - assert( fReadyInputTensorInfos.size() + fInputTensorInfos.size() == fInputTensorNames.size()); - } - // store the parameters of the input tensors - else { - // store the found parametric shape parameters - for (auto &d : input.second.shape) { - if (d.isParam) - fShapeParams[d.param] = std::to_string(d.dim); - } - } - } - - if (verbose) { - PrintRequiredInputTensors(); - PrintDynamicTensors(); - } - - // check if there are initialized tensors to write in a weight file - // support for the time being only weight of FLOAT type - if (fUseWeightFile) { - bool modelHasWeights = false; - for (auto &i : 
fInitializedTensors) { - if (i.second.type() == ETensorType::FLOAT) { - modelHasWeights = true; - break; - } - } - if (!modelHasWeights) - fUseWeightFile = false; - } - // Go through model and initialize each operator - int i = 0; - - std::vector temp_available_stack; // vector stores individual chunks of available memory that maybe reused - - for(size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx){ - if (verbose) { - auto& r = *fOperators[op_idx].get(); - std::cout << "Initializing operator " << i << " " << typeid(r).name() << std::endl; - } - fOperators[op_idx]->Initialize(*this); - for(auto &it:fOperators[op_idx]->GetOpOutputTensors()){ - if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() && - std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), std::string(it)) == fOutputTensorNames.end() && - fInitializedTensors.find(std::string(it)) == fInitializedTensors.end() && - fDynamicTensorInfos.find(std::string(it)) == fDynamicTensorInfos.end()){ - fIntermediateTensorFrequencyLookup[it] = op_idx; - } - } - i++; - } - - fIsInitialized = true; -} - -void RModel::InitializeSubGraph(std::shared_ptr graph) { - // add the subgraph to the list - fSubGraphs.push_back(graph); - //this needs to be done before initializing - graph->fParentGraph = this; - graph->fIsSubGraph = true; - - graph->Initialize(fBatchSize, fVerbose); - // set the same options as parent model - graph->fWeightFile = fWeightFile; - graph->fUseWeightFile = fUseWeightFile; - graph->fUseSession = fUseSession; - // add needed blas routines and libs - std::vector blasRoutines; - for (auto & e : graph->fNeededBlasRoutines) - blasRoutines.push_back(e); - AddBlasRoutines(blasRoutines); - for (auto e : graph->fNeededStdLib) - AddNeededStdLib(e); - - // add parent input tensors to current graph - for (auto & name : fInputTensorNames) - graph->fInputTensorNames.emplace_back(name); - - // clean graph name - graph->fName = UTILITY::Clean_name(graph->fName); - -} - 
-// Function to generate the code for declaring and initializing constant tensors -// This is for tensors which are not part of weight files and can be created from the Constant operator -template -std::string GenerateConstantTensorCode(const std::pair &t) -{ - std::stringstream strs; - std::string type = ConvertTypeToString(t.second.type()); - size_t length = ConvertShapeToLength(t.second.shape()); - // avoid using stack sizes for constant tensors to reduce compilation time - bool allocateOnStack = (length > 100) ? false : true; - - const T *data = t.second.data(); - - // and check if all values are the same - bool sameData = false; - // for non stack allocation check if data are the same - if (!allocateOnStack && length > 1) { - size_t idx = 1; - do { - sameData = (data[idx] == data[idx - 1]); - idx++; - } while (sameData && idx < length); - } - if (allocateOnStack) { - strs << type << " tensor_" << t.first << "[" << length << "] = " << ConvertValuesToString(length, data) << ";\n"; - } else { - strs << "std::vector<" << type << "> fTensor_" << t.first << " = "; - if (sameData) - strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; - else { - strs << ConvertValuesToString(length, data) << ";\n"; - } - strs << "const " << type << " * tensor_" + t.first + " = fTensor_" + t.first + ".data();\n"; - } - return strs.str(); -} - -void RModel::GenerateInitializedTensorInfo() -{ - if (!fInitializedTensors.empty()) - fGC += "// initialized tensors\n"; - - for (auto &i : fInitializedTensors) { - if (!fUseWeightFile || i.second.IsConstantTensor()) { - if (i.second.type() == ETensorType::FLOAT) - fGC += GenerateConstantTensorCode(i); - else if (i.second.type() == ETensorType::INT64) - fGC += GenerateConstantTensorCode(i); - - } else { - // case of tensors which are read from a file - size_t length = ConvertShapeToLength(i.second.shape()); - if (i.second.type() == ETensorType::FLOAT) { - fGC += "std::vector fTensor_" + i.first + " = 
std::vector(" + std::to_string(length) + ");\n"; - fGC += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; - } - } - } -} - -void RModel::GenerateIntermediateMemoryPool() { - if (fIntermediateMemoryInfo.total_stack.size() == 0) return; - fGC += "\n//--- Allocating session memory pool to be used for allocating intermediate tensors\n"; - - // char memory block is allocated since char takes 1 byte, thus easier to allocate tensors - // of other data types - fGC += "char* fIntermediateMemoryPool = new char[" + std::to_string(fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size)+ "];\n\n"; -} - -void RModel::GenerateIntermediateTensorInfo() { - if (!fIntermediateTensorInfos.empty()) { - std::string tensor_declaration_block = ""; - - for (auto &i : fIntermediateTensorInfos) { - if (i.second.type == ETensorType::BOOL) { - tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; - // No pointer allocation needed for BOOL - } - if (fIntermediateTensorFrequencyLookup.find(i.first) == fIntermediateTensorFrequencyLookup.end() && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == fOutputTensorNames.end()) { - size_t length = ConvertShapeToLength(i.second.shape); - - if (i.second.type == ETensorType::FLOAT) { - tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - tensor_declaration_block += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; - } - else if (i.second.type == ETensorType::DOUBLE) { - tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - tensor_declaration_block += "double * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; - } - else if (i.second.type == ETensorType::INT64) { - tensor_declaration_block += 
"std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - tensor_declaration_block += "int64_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; - } - } - } - - if (tensor_declaration_block.length()) { - fGC += "\n//--- declare and allocate the intermediate tensors\n" + tensor_declaration_block; - } - } - // add also the dynamic tensors (only declarations, allocation will be done later) - if (!fDynamicTensorInfos.empty()) { - fGC += "//--- declare the dynamic tensors\n"; - for (auto &i : fDynamicTensorInfos) { - if (i.second.type == ETensorType::FLOAT) { - fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "float * tensor_" + i.first + " = nullptr;\n"; - } else if (i.second.type == ETensorType::DOUBLE) { - fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "double * tensor_" + i.first + " = nullptr;\n"; - } else if (i.second.type == ETensorType::INT64) { - fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "int64_t * tensor_" + i.first + " = nullptr;\n"; - } - } - } -} - -// generate code for specific operator declarations to be defined in the Session class -void RModel::GenerateOperatorDeclarations() { - std::string strcode; - for (auto & op : fOperators) { - strcode += op->GenerateDeclCode(); - } - if (strcode.empty()) return; - fGC += "\n//---- operator declarations \n"; - fGC += strcode; - fGC += "\n"; -} - -void RModel::GenerateDynamicTensorInfo() { - fGC += "//---- allocate the intermediate dynamic tensors\n"; - std::stringstream out; - for (auto & i: fDynamicTensorInfos) { - auto length = ConvertDynamicShapeToLength(i.second.shape); - out << SP << "if (" << length << " > 0) {\n"; - out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; - out << SP << SP << "tensor_" << i.first << " = fTensor_" << i.first << ".data();\n"; - out << SP << "}\n"; - } - fGC += out.str(); -} - -std::string RModel::GenerateInferSignature(bool isdecl) { - // generate the infer signature given the 
inputs: eg. "float * tensor1, float * tensor2" - // if (decl = false) generate only calling signature (tensor1,tensor2,....) - std::string rGC; - std::unordered_map inputParams; - int i_input = 0; - for (auto &name : fInputTensorNames) { - // if is a dynamic tensor pass initial parameters - if (IsDimInputTensor(name)) { - auto shape = GetDynamicTensorShape(name); - for (auto &d : shape) { - std::string pName = d.param; - // need to check if the input parameters is already existing in another input tensor - if (d.isParam && inputParams.count(pName) == 0) { - if (isdecl) rGC += "size_t "; - rGC += d.param + ","; - inputParams[pName] = i_input; - } - } - } - if (isdecl) { - std::string type = ConvertTypeToString(GetTensorType(name)); - if (type == "other") - throw std::runtime_error("TMVA-SOFIE: input tensor " + name + - " is of a data type which is not yet supported."); - rGC += type + "* "; - } - rGC += "tensor_" + name + ","; - i_input++; - } - - if (fInputTensorNames.size() > 0) rGC.pop_back();// remove last "," - return rGC; -} - -namespace { - -std::string createOutputTensor(RModel const &rmodel, std::string const &name, bool isIntermediateTensor) -{ - if(name.empty()) return "{}"; - ETensorType eOutputType = rmodel.GetTensorType(name); - std::string outputType = ConvertTypeToString(eOutputType); - if (isIntermediateTensor) { - - if (eOutputType == ETensorType::BOOL) { - return "fTensor_" + name; - } else { - // need to check is size is the same(don't want to return a vector with larger size) - // in that case better to copy - return "std::vector<" + ConvertTypeToString(eOutputType) + ">(tensor_" + name + ", tensor_" + name + " + " + - std::to_string(ConvertShapeToLength(rmodel.GetTensorShape(name))) + ")"; - } - } - // include also dynamic tensors since the vectors can be allocated with a size larger than their output - // we need a special handling for bool type allocated as vector - auto outputLength = 
ConvertDynamicShapeToLength(rmodel.GetDynamicTensorShape(name)); - if (rmodel.IsDynamicTensor(name) && eOutputType == ETensorType::BOOL) { - return "std::vector(fTensor_" + name + ".begin(), fTensor_" + name + ".begin() + " + outputLength + ")"; - } - return "std::vector<" + outputType + ">(tensor_" + name + ", tensor_" + name + " + " + outputLength + ")"; -} - -} // namespace - -void RModel::GenerateOutput() { - - if (fVerbose) - std::cout << "Generating main inference code for " << fName << std::endl; - - size_t outputSize = fOutputTensorNames.size(); - // assume output types are all the same - if (outputSize == 0) - throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); - - bool sameOutputTypes = true; - std::string inferReturnType; // type return by infer function - ETensorType eOutputType = GetTensorType(*fOutputTensorNames.begin()); - std::string outputType = ConvertTypeToString(eOutputType); - fGC += "\n\n"; - if (outputSize == 1) { - fGC += "std::vector<" + outputType + ">"; - } else { - // if all output types are the same we return an std::vector - otherwise a tuple - for (size_t i = 1; i < outputSize; i++) { - if (GetTensorType(fOutputTensorNames[i]) != eOutputType) - sameOutputTypes = false; - } - if (sameOutputTypes) - fGC += "std::vector>"; - else { - inferReturnType = "std::tuple<"; - for (size_t i = 0; i < outputSize; i++) { - inferReturnType += "std::vector<" + ConvertTypeToString(GetTensorType(fOutputTensorNames[i])) + ">"; - if (i < outputSize-1) inferReturnType += ","; - } - inferReturnType += ">"; - fGC += inferReturnType; - } - } - - fGC += " infer("; - - fGC += GenerateInferSignature(); - - fGC += "){\n"; - - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - if (fVerbose) std::cout << "Generating code for operator .... 
" << op_idx << std::endl; - fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); - } - - fGC += SP + "return {"; - for (size_t i = 0; i < outputSize; i++) { - std::string tensorName = *(fOutputTensorNames.begin() + i); - bool isIntermediate = fIntermediateTensorInfos.count(tensorName) > 0; - fGC += createOutputTensor(*this, tensorName, isIntermediate); - if (i < outputSize - 1) - fGC += ","; - } - fGC += "};\n"; - fGC += "}\n"; // end of infer function scope -} - -void RModel::GenerateSessionCode() -{ - - // define the Session struct (for GNN this is generated in RModel_GNN) - if (fUseSession && !fIsGNNComponent) { - if (!fIsSubGraph) - fGC += "struct Session {\n"; - else - fGC += "struct Session_" + fName + " {\n"; - } - - // generate code for declaring the initialized tensors - GenerateInitializedTensorInfo(); - - // evaluate total intermediate memory and position intermediate tensor addresses - std::string intermediate_memory_alloc_string = ""; - intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --"; - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors()); - CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); - } - - // to check remaining unused fragments after memory allocation (lesser the better) - // for (const auto &it: fIntermediateMemoryInfo.available_stack){ - // std::cout<<"chunk_idx: "<fName + " fSession_" + graph->fName + ";\n"; - } - - // Generate code for Session constructor - if (fUseSession) { - std::string sessionName = "Session"; - if (fIsSubGraph) - sessionName += "_" + fName; - // add here specific operator code that needs to define session data members - fGC += "\n"; - for (size_t id = 0; id < fOperators.size(); id++) { - std::string opName = std::to_string(id); - fGC += fOperators[id]->GenerateSessionMembersCode(opName); - } - fGC += "\n"; - // here add 
initialization and reading of weight tensors - if (fUseWeightFile) { - std::string fileName = fName; - if (fWeightFile == WeightFileType::Text) { - fileName += ".dat"; - } - if (fWeightFile == WeightFileType::RootBinary) { - fileName += ".root"; - } - fGC += sessionName + "(std::string filename =\"" + fileName + "\""; - } else { - // no need to pass weight file since it is not used - // keep passing a string for compatibility - fGC += sessionName + "(std::string = \"\""; - } - // add initialization of shape parameters - // assume all parameters are of type size_t - if (!fShapeParams.empty()) { - for (auto &p : fShapeParams) { - fGC += ",\n"; - fGC += " size_t " + p.first + " = " + p.second; - } - } - fGC += ") {\n"; - - if (fUseWeightFile) { - fGC += "\n//--- reading weights from file\n"; - ReadInitializedTensorsFromFile(fReadPos); - fGC += "\n"; - // fUseWeightFile = fUseWeightFile; - } - - // now we have passed the parameters we can allocate the dynamic tensors - GenerateDynamicTensorInfo(); - - // add here initialization code for operator - for (size_t id = 0; id < fOperators.size(); id++) { - fGC += fOperators[id]->GenerateInitCode(); - } - - fGC += "}\n\n"; - } - // generate the inference code - GenerateOutput(); - - // end of session - if (fUseSession && !fIsGNNComponent) { - fGC += "}; // end of Session\n"; - } -} - -void RModel::Generate(std::underlying_type_t options, int batchSize, long pos, bool verbose) -{ - fVerbose = verbose; - fBatchSize = batchSize; - fReadPos = pos; - - // session flag is used in operator initialize - if (static_cast>(Options::kNoSession) & options) { - fUseSession = false; - fWeightFile = WeightFileType::None; - } - if (static_cast>(Options::kNoWeightFile) & options) { - fUseWeightFile = false; - fWeightFile = WeightFileType::None; - } - if (static_cast>(Options::kRootBinaryWeightFile) & options) { - fUseWeightFile = true; - fWeightFile = WeightFileType::RootBinary; - } - if (fUseWeightFile && !fUseSession) { - throw 
std::runtime_error( - "TMVA-SOFIE: RModel::Generate: cannot use a separate weight file without generating a Session class"); - } - - if (static_cast>(Options::kGNN) & options) - fIsGNN = true; - if (static_cast>(Options::kGNNComponent) & options) - fIsGNNComponent = true; - - // initialize the model including all operators and sub-graphs - Initialize(batchSize, verbose); - - std::string hgname; - if (!fIsGNNComponent && !fIsSubGraph) { - fGC.clear(); - GenerateHeaderInfo(hgname); - } - - // generate first code for the subgraphs - for (auto &graph : fSubGraphs) { - if (fVerbose) - std::cout << "generate session code for subgraph " << graph->fName << std::endl; - graph->GenerateSessionCode(); - fGC += graph->fGC; - } - - if (fVerbose) - std::cout << "generate Main session code - model " << fName << std::endl; - - // generate main session code - GenerateSessionCode(); - - if (!fIsGNNComponent && !fIsSubGraph) { - fGC += ("} //SOFIE_" + fName + "\n"); - fGC += "\n#endif // " + hgname + "\n"; - } -} - -void RModel::ReadInitializedTensorsFromFile(long pos) { - // generate the code to read initialized tensors from a text data file - if (fWeightFile == WeightFileType::Text) { - if (fInitializedTensors.empty()) return; - - fGC += " std::ifstream f;\n"; - fGC += " f.open(filename);\n"; - fGC += " if (!f.is_open()) {\n"; - fGC += " throw std::runtime_error(\"tmva-sofie failed to open file \" + filename + \" for input weights\");\n"; - fGC += " }\n"; - - if(fIsGNNComponent) { - fGC += " f.seekg(" + std::to_string(pos) + ");\n"; - } - - fGC += " std::string tensor_name;\n"; - fGC += " size_t length;\n"; - - // loop on tensors and parse the file - for (auto& i: fInitializedTensors) { - // skip Constant and shape tensors (not written in a file) - if (!i.second.IsWeightTensor()) continue; - std::string tensor_name = "tensor_" + i.first; - if (i.second.type() == ETensorType::FLOAT) { - size_t length = 1; - length = ConvertShapeToLength(i.second.shape()); - std::string slength = 
std::to_string(length); - fGC += " f >> tensor_name >> length;\n"; - fGC += " if (tensor_name != \"" + tensor_name + "\" ) {\n"; - fGC += " std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor name; expected name is " + - tensor_name + " , read \" + tensor_name;\n"; - fGC += " throw std::runtime_error(err_msg);\n"; - fGC += " }\n"; - fGC += " if (length != " + slength + ") {\n"; - fGC += " std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor size; expected size is " + - slength + " , read \" + std::to_string(length) ;\n"; - fGC += " throw std::runtime_error(err_msg);\n"; - fGC += " }\n"; - fGC += " for (size_t i = 0; i < length; ++i)\n"; - fGC += " f >> " + tensor_name + "[i];\n"; - fGC += " if (f.fail()) {\n"; - fGC += " throw std::runtime_error(\"TMVA-SOFIE failed to read the values for tensor " + tensor_name + "\");\n"; - fGC += " }\n"; - } else { - std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); - } - } - fGC += " f.close();\n"; - } - - // generate the code to read initialized tensors from a ROOT data file - if(fWeightFile == WeightFileType::RootBinary) { - fGC += " {\n"; - fGC += " std::unique_ptr rootFile(TFile::Open(filename.c_str(), \"READ\"));\n"; - fGC += " if (!rootFile->IsOpen()) {\n"; - fGC += " throw std::runtime_error(\"tmva-sofie failed to open ROOT file for input weights\");\n"; - fGC += " }\n"; - - std::string dirName = fName + "_weights"; - fGC += " if (!rootFile->GetKey(\"" + dirName + "\")) {\n"; - fGC += " throw std::runtime_error(\"tmva-sofie failed to open ROOT directory for input weights\");\n"; - fGC += " }\n"; - - for (auto &i : fInitializedTensors) { - // skip Constant and shape tensors - if (!i.second.IsWeightTensor()) continue; - fGC += " {\n"; - std::string tensor_name = "tensor_" + i.first; - if (i.second.type() == ETensorType::FLOAT) { - fGC += " fTensor_" + i.first + " = 
*reinterpret_cast*>(rootFile->Get(\""; - fGC += dirName + "/" + tensor_name + "\"));\n"; - } else if (i.second.type() == ETensorType::DOUBLE) { - fGC += " fTensor_" + i.first + " = *reinterpret_cast*>(rootFile->Get(\""; - fGC += dirName + + "/" + tensor_name + "\"));\n"; - } else if (i.second.type() == ETensorType::INT64) { - fGC += " fTensor_" + i.first + " = *reinterpret_cast*>(rootFile->Get(\""; - fGC += dirName + "/" + tensor_name + "\"));\n"; - } else { - std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a ROOT file"); - } - fGC += " }\n"; - } - fGC += " }\n"; - } -} - -long RModel::WriteInitializedTensorsToFile(std::string filename) { - // Determine the file extension based on the weight file type - std::string fileExtension; - switch (fWeightFile) { - case WeightFileType::None: - fileExtension = ".dat"; - break; - case WeightFileType::RootBinary: - fileExtension = ".root"; - break; - case WeightFileType::Text: - fileExtension = ".dat"; - break; - } - - // If filename is empty, use the model name as the base filename - if (filename.empty()) { - filename = fFileName + fileExtension; - } - - // Write the initialized tensors to the file - if (fWeightFile == WeightFileType::RootBinary) { - if(fIsGNNComponent || fIsGNN) { - throw std::runtime_error("SOFIE-GNN yet not supports writing to a ROOT file."); - } - std::unique_ptr outputFile(TFile::Open(filename.c_str(), "UPDATE")); - - std::string dirName = fName + "_weights"; - // check if directory exists, in case delete to replace with new one - if (outputFile->GetKey(dirName.c_str())) - outputFile->rmdir(dirName.c_str()); - - auto outputDir = outputFile->mkdir(dirName.c_str()); - - for (const auto& item : fInitializedTensors) { - // skip Constant tensors and tensors which are not writable (e.g. 
shape tensors) - if (!item.second.IsWeightTensor()) continue; - std::string tensorName = "tensor_" + item.first; - size_t length = 1; - length = ConvertShapeToLength(item.second.shape()); - if(item.second.type() == ETensorType::FLOAT) { - const float* data = item.second.data(); - std::vector tensorDataVector(data, data + length); - outputDir->WriteObjectAny(&tensorDataVector, "std::vector", tensorName.c_str()); - } - else if(item.second.type() == ETensorType::DOUBLE) { - const double* data = item.second.data(); - std::vector tensorDataVector(data, data + length); - outputDir->WriteObjectAny(&tensorDataVector, "std::vector", tensorName.c_str()); - } - else if(item.second.type() == ETensorType::INT64) { - const int64_t* data = item.second.data(); - std::vector tensorDataVector(data, data + length); - outputDir->WriteObjectAny(&tensorDataVector, "std::vector", tensorName.c_str()); - } - else { - std::runtime_error("tmva-sofie tensor " + tensorName + " with type " + ConvertTypeToString(item.second.type()) + - " cannot be written to a ROOT file"); - } - } - outputFile->Write(filename.c_str()); - - // this needs to be changed, similar to the text file - return -1; - - } else if (fWeightFile == WeightFileType::Text) { - std::ofstream f; - if(fIsGNNComponent) { - // appending all GNN components into the same file - f.open(filename, std::ios::app); - } else { - f.open(filename); - } - if (!f.is_open()) - throw - std::runtime_error("tmva-sofie failed to open file " + filename + " for tensor weight data"); - for (auto& i: fInitializedTensors) { - // skip Constant tensors and not writable tensors (e.g. 
shape tensors) - if (!i.second.IsWeightTensor()) { - continue; - } - size_t length = ConvertShapeToLength(i.second.shape()); - std::string tensor_name = "tensor_" + i.first; - f << tensor_name << " " << length << "\n"; - if (i.second.type() == ETensorType::FLOAT) { - const float * data = i.second.data(); - for (size_t idx = 0; idx < length; idx++) { - // round to zero sub-normal values - float value = data[idx]; - if (value != 0. && std::abs(value) < std::numeric_limits::min() ) value = 0; - f << std::setprecision(std::numeric_limits::max_digits10) << value; - f << ( (idx < length-1) ? " " : "\n" ); - } - } - else { - std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be written to a file"); - } - if (f.fail()) - std::runtime_error("tmva-sofie failed to write tensor data to file for " + tensor_name); - } - long curr_pos = f.tellp(); - f.close(); - return curr_pos; - } else { - return -1; - } -} - -void RModel::PrintRequiredInputTensors() { - std::cout << "Model requires following inputs:\n"; - for (auto& inputInfo: fInputTensorInfos) { - std::cout << "Parametrised Tensor name: " << inputInfo.first << "\t"; - std::cout << "type: " << ConvertTypeToString(inputInfo.second.type) << "\t"; - std::cout << "shape: ["; - for (size_t i = 0; i < inputInfo.second.shape.size(); i++) { - if (inputInfo.second.shape[i].isParam) { - std::cout << inputInfo.second.shape[i].param; - } else { - std::cout << inputInfo.second.shape[i].dim ; - } - if (i < inputInfo.second.shape.size() - 1) std::cout << ","; - } - std::cout << "]" << std::endl; - } - - for (auto& inputInfo: fReadyInputTensorInfos) { - std::cout << "Fully Specified Tensor name: " << inputInfo.first << "\t"; - std::cout << "type: " << ConvertTypeToString(inputInfo.second.type) << "\t"; - std::cout << "shape: ["; - for (size_t i = 0; i < inputInfo.second.shape.size(); i++) { - std::cout << inputInfo.second.shape[i]; - if (i < inputInfo.second.shape.size() - 
1) std::cout << ","; - } - std::cout << "]" << std::endl; - } - std::cout << "\n"; -} - -void RModel::PrintInitializedTensors() { - std::cout << "Model initialized the following tensors:\n"; - for (auto& it: fInitializedTensors) { - std::cout << "Tensor name: \"" << it.first << "\"\t"; - std::cout << "type: " << ConvertTypeToString(it.second.type()) << "\t"; - std::cout << "shape: ["; - for (size_t i = 0; i < it.second.shape().size(); i++) { - std::cout << it.second.shape()[i]; - if (i < it.second.shape().size() - 1) std::cout << ","; - } - std::cout << "]"; - if (it.second.IsConstantTensor()) std::cout << " (Constant)"; - else if (!it.second.IsWeightTensor()) std::cout << " (Not Writable)"; - std::cout << std::endl; - } - std::cout << "\n"; -} - -void RModel::PrintIntermediateTensors() { - std::cout << "Model specify the following intermediate tensors:\n"; - for (auto& it: fIntermediateTensorInfos) { - std::cout << "Tensor name: \"" << it.first << "\"\t"; - std::cout << "type: " << ConvertTypeToString(it.second.type) << "\t"; - std::cout << "shape: ["; - for (size_t i = 0; i < it.second.shape.size(); i++) { - std::cout << it.second.shape[i]; - if (i < it.second.shape.size() - 1) std::cout << ","; - } - std::cout << "]" << std::endl; - } - std::cout << "\n"; -} - -void RModel::PrintDynamicTensors() { - std::cout << "Model specify the following dynamic tensors:\n"; - for (auto& it: fDynamicTensorInfos) { - std::cout << "Tensor name: \"" << it.first << "\"\t"; - std::cout << "type: " << ConvertTypeToString(it.second.type) << "\t"; - std::cout << "shape: ["; - for (size_t i = 0; i < it.second.shape.size(); i++) { - std::cout << it.second.shape[i].GetVal(); - if (i < it.second.shape.size() - 1) std::cout << ","; - } - std::cout << "]" << std::endl; - } - std::cout << "\n"; -} - -void RModel::PrintOutputTensors() { - std::cout << "Model specify the following output tensors:\n"; - for (auto& it: fOutputTensorNames) { - std::cout << "Tensor name: \"" << it << "\"\t"; - if 
(!IsDynamicTensor(it)) - std::cout << "shape: " << ConvertShapeToString(GetTensorShape(it)) << std::endl; - else - std::cout << "shape: " << ConvertDynamicShapeToString(GetDynamicTensorShape(it)) << std::endl; - } - std::cout << "\n"; -} - -void RModel::HeadInitializedTensors(std::string name, int n_print) { - auto it = fInitializedTensors.find(name); - if (it == fInitializedTensors.end()) { - std::cout << "Tensor " << name << " not found in model's initialized tensor list" << std::endl; - return; - } - - std::cout << "Tensor name: " << it->first << "\t"; - std::cout << "type: " << ConvertTypeToString(it->second.type()) << "\t"; - int length =1; - std::cout << "shape: ["; - for (size_t i = 0; i < it->second.shape().size(); i++) { - std::cout << it->second.shape()[i]; - length *= it->second.shape()[i]; - if (i < it->second.shape().size() - 1) std::cout << ","; - } - std::cout << "]" << std::endl; - bool ellipsis = true; - if (n_print > length) { - n_print = length; - ellipsis = false; - } - - std::cout << "data: [" << std::endl; - if (it->second.type() == ETensorType::FLOAT) { - auto converted_data = it->second.data(); - for (int i =0; i < n_print; i++) { - std::cout << converted_data[i]; - if (i < n_print - 1) std::cout << " ,"; - } - } - if (ellipsis) std::cout << ", ..."; - std::cout << "]" << std::endl; - -} - -void RModel::OutputGenerated(std::string filename, bool append) { - - RModel_Base::OutputGenerated(filename, append); - - // write weights in a text file - if (fUseWeightFile) { - if (!filename.empty()) { - size_t pos = filename.find(".hxx"); - if (fWeightFile == WeightFileType::Text) - filename.replace(pos, 4, ".dat"); - if (fWeightFile == WeightFileType::RootBinary) { - filename = filename.erase(pos, 4); - filename += ".root"; - } - } else { - filename = fName; - filename += fWeightFile == WeightFileType::Text ? 
".dat" : ".root"; - } - WriteInitializedTensorsToFile(filename); - } -} - -void RModel::Streamer(TBuffer &R__b) { - if (R__b.IsReading()) { - RModel::Class()->ReadBuffer(R__b, this); - for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) { - i->second.CastPersistentToShared(); - } - } - else { - for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) { - i->second.CastSharedToPersistent(); - } - RModel::Class()->WriteBuffer(R__b, this); - } -} - -}//SOFIE diff --git a/src/SOFIE_core/test/CMakeLists.txt b/src/SOFIE_core/test/CMakeLists.txt deleted file mode 100644 index 34bb49f..0000000 --- a/src/SOFIE_core/test/CMakeLists.txt +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (C) 1995-2021, Rene Brun and Fons Rademakers. -# All rights reserved. -# -# For the licensing terms see $ROOTSYS/LICENSE. -# For the list of contributors see $ROOTSYS/README/CREDITS. - -############################################################################ -# CMakeLists.txt file for building TMVA SOFIE tests. -# @author Federico Sossai, Sanjiban Sengupta -############################################################################ - -include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_core/inc) -include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_parsers/inc) - -if (NOT ONNX_MODELS_DIR) - set(ONNX_MODELS_DIR input_models) -endif() - -# Finding .onnx files to be parsed and creating the appropriate code to -# parse all file. 
It is much faster to combine all parsing in a single executable -# which will avoid initialization time (especially when using ROOT) -set(CAPTURE_STR "EmitModel( \"@1\", \"@2\");") -set(ALL_CAPTURES "") -# Finding .onnx files to be parsed and creating the appropriate command -file(GLOB ONNX_FILES "${ONNX_MODELS_DIR}/*.onnx") -foreach(onnx_file ${ONNX_FILES}) - get_filename_component(fname ${onnx_file} NAME_WE) - get_filename_component(fdir ${onnx_file} DIRECTORY) - string(REPLACE "@1" ${onnx_file} cap ${CAPTURE_STR}) - string(REPLACE "@2" ${fname} cap ${cap}) - list(APPEND ALL_CAPTURES ${cap}) -endforeach() -string(REPLACE ";" ";\n" EMIT_CAPTURES "${ALL_CAPTURES}") -configure_file(EmitFromONNX.cxx.in EmitFromONNX_all.cxx @ONLY) -configure_file(EmitFromRoot.cxx.in EmitFromRoot_all.cxx @ONLY) - -ROOTTEST_GENERATE_EXECUTABLE(emitFromONNX EmitFromONNX_all.cxx - LIBRARIES protobuf::libprotobuf SOFIE_core SOFIE_parsers - FIXTURES_SETUP sofie-compile-models-onnx-build) - -# silence protobuf warnings seen in version 3.0 and 3.6. 
Not needed from protobuf version 3.17 -target_compile_options(emitFromONNX PRIVATE -Wno-unused-parameter -Wno-array-bounds) - -ROOTTEST_ADD_TEST(SofieCompileModels_ONNX - COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNX ${onnx_file} ${CMAKE_CURRENT_BINARY_DIR}/${fname} - FIXTURES_REQUIRED sofie-compile-models-onnx-build - FIXTURES_SETUP sofie-compile-models-onnx -) - -# Creating a Google Test -if (BLAS_FOUND) # we need BLAS for compiling the models - ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromONNX TestCustomModelsFromONNX.cxx - LIBRARIES - MathCore - SOFIE_core - BLAS::BLAS - GTest::gtest - GTest::gtest_main - FIXTURES_REQUIRED - sofie-compile-models-onnx - FIXTURES_SETUP - sofie-test-models-onnx-build - ) - target_include_directories(TestCustomModelsFromONNX PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - ROOTTEST_ADD_TEST(TestCustomModelsFromONNX - EXEC ./TestCustomModelsFromONNX - FIXTURES_REQUIRED sofie-test-models-onnx-build) -endif() - -# For testing serialisation of RModel object - -ROOTTEST_GENERATE_EXECUTABLE(emitFromROOT EmitFromRoot_all.cxx - LIBRARIES protobuf::libprotobuf RIO SOFIE_core SOFIE_parsers - FIXTURES_SETUP sofie-compile-models-onnx-root -) -# silence protobuf warnings seen in version 3.0 and 3.6. 
Not needed from protobuf version 3.17 -target_compile_options(emitFromROOT PRIVATE -Wno-unused-parameter -Wno-array-bounds) - -# Automatic compilation of headers from root files -ROOTTEST_ADD_TEST(SofieCompileModels_ROOT - COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromROOT - FIXTURES_REQUIRED sofie-compile-models-onnx-root - FIXTURES_SETUP sofie-compile-models-root -) - -if (BLAS_FOUND) - # Creating a Google Test for Serialisation of RModel - ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromROOT TestCustomModelsFromROOT.cxx - LIBRARIES - SOFIE_core - BLAS::BLAS - GTest::gtest - GTest::gtest_main - FIXTURES_REQUIRED - sofie-compile-models-root - FIXTURES_SETUP - sofie-test-models-root-build - ) - target_include_directories(TestCustomModelsFromROOT PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - ROOTTEST_ADD_TEST(TestCustomModelsFromROOT - EXEC ./TestCustomModelsFromROOT - FIXTURES_REQUIRED sofie-test-models-root-build) -endif() - -# Look for needed Python modules -ROOT_FIND_PYTHON_MODULE(torch) -if (ROOT_TORCH_FOUND) - configure_file(Conv1dModelGenerator.py Conv1dModelGenerator.py COPYONLY) - configure_file(Conv2dModelGenerator.py Conv2dModelGenerator.py COPYONLY) - configure_file(Conv3dModelGenerator.py Conv3dModelGenerator.py COPYONLY) - configure_file(ConvTrans2dModelGenerator.py ConvTrans2dModelGenerator.py COPYONLY) - configure_file(LinearModelGenerator.py LinearModelGenerator.py COPYONLY) - configure_file(RecurrentModelGenerator.py RecurrentModelGenerator.py COPYONLY) - - if (BLAS_FOUND) - ROOT_ADD_GTEST(TestSofieModels TestSofieModels.cxx - LIBRARIES - SOFIE_core - SOFIE_parsers - BLAS::BLAS - INCLUDE_DIRS - ${CMAKE_CURRENT_BINARY_DIR} - ) - endif() -endif() - -ROOT_EXECUTABLE(emitGNN GNN/EmitGNN.cxx LIBRARIES SOFIE_core) -ROOT_ADD_TEST(tmva-sofie-EmitGNN COMMAND emitGNN) - -ROOT_EXECUTABLE(EmitGraphIndependent GNN/EmitGraphIndependent.cxx LIBRARIES SOFIE_core) -ROOT_ADD_TEST(tmva-sofie-EmitGraphIndependent COMMAND EmitGraphIndependent) diff --git 
a/utils/CMakeLists.txt b/utils/CMakeLists.txt new file mode 100644 index 0000000..36cfc55 --- /dev/null +++ b/utils/CMakeLists.txt @@ -0,0 +1,15 @@ +add_library(utils INTERFACE) + +target_include_directories(utils INTERFACE + $ + $ +) + +install(TARGETS utils + EXPORT SOFIETargets +) + +install( + DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/SOFIE + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) diff --git a/utils/SOFIE/RTensor.hxx b/utils/SOFIE/RTensor.hxx new file mode 100644 index 0000000..db82dc9 --- /dev/null +++ b/utils/SOFIE/RTensor.hxx @@ -0,0 +1,628 @@ +#ifndef SOFIE_RTENSOR +#define SOFIE_RTENSOR + +#include +#include // std::size_t +#include +#include // std::runtime_error +#include // std::stringstream +#include // std::shared_ptr +#include // std::is_convertible +#include // std::reverse +#include // std::random_access_iterator_tag + +namespace SOFIE { + +/// Memory layout type +enum class MemoryLayout : uint8_t { + RowMajor = 0x01, + ColumnMajor = 0x02 +}; + +namespace Internal { + +/// \brief Get size of tensor from shape vector +/// \param[in] shape Shape vector +/// \return Size of contiguous memory +template +inline std::size_t GetSizeFromShape(const T &shape) +{ + if (shape.size() == 0) + return 0; + std::size_t size = 1; + for (auto &s : shape) + size *= s; + return size; +} + +/// \brief Compute strides from shape vector. +/// \param[in] shape Shape vector +/// \param[in] layout Memory layout +/// \return Size of contiguous memory +/// +/// This information is needed for the multi-dimensional indexing. 
See here: +/// https://en.wikipedia.org/wiki/Row-_and_column-major_order +/// https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.strides.html +template +inline std::vector ComputeStridesFromShape(const T &shape, MemoryLayout layout) +{ + const auto size = shape.size(); + T strides(size); + if (layout == MemoryLayout::RowMajor) { + for (std::size_t i = 0; i < size; i++) { + if (i == 0) { + strides[size - 1 - i] = 1; + } else { + strides[size - 1 - i] = strides[size - 1 - i + 1] * shape[size - 1 - i + 1]; + } + } + } else if (layout == MemoryLayout::ColumnMajor) { + for (std::size_t i = 0; i < size; i++) { + if (i == 0) { + strides[i] = 1; + } else { + strides[i] = strides[i - 1] * shape[i - 1]; + } + } + } else { + std::stringstream ss; + ss << "Memory layout type is not valid for calculating strides."; + throw std::runtime_error(ss.str()); + } + return strides; +} + +/// \brief Compute indices from global index +/// \param[in] shape Shape vector +/// \param[in] idx Global index +/// \param[in] layout Memory layout +/// \return Indice vector +template +inline T ComputeIndicesFromGlobalIndex(const T& shape, MemoryLayout layout, const typename T::value_type idx) +{ + const auto size = shape.size(); + auto strides = ComputeStridesFromShape(shape, layout); + T indices(size); + auto r = idx; + for (std::size_t i = 0; i < size; i++) { + indices[i] = int(r / strides[i]); + r = r % strides[i]; + } + return indices; +} + +/// \brief Compute global index from indices +/// \param[in] strides Strides vector +/// \param[in] idx Indice vector +/// \return Global index +template +inline std::size_t ComputeGlobalIndex(const U& strides, const V& idx) +{ + std::size_t globalIndex = 0; + const auto size = idx.size(); + for (std::size_t i = 0; i < size; i++) { + globalIndex += strides[size - 1 - i] * idx[size - 1 - i]; + } + return globalIndex; +} + +/// \brief Type checking for all types of a parameter pack, e.g., used in combination with std::is_convertible +template 
+struct and_types : std::true_type { +}; + +template +struct and_types : std::integral_constant()> { +}; + +/// \brief Copy slice of a tensor recursively from here to there +/// \param[in] here Source tensor +/// \param[in] there Target tensor (slice of source tensor) +/// \param[in] mins Minimum of indices for each dimension +/// \param[in] maxs Maximum of indices for each dimension +/// \param[in] idx Current indices +/// \param[in] active Active index needed to stop the recursion +/// +/// Copy the content of a slice of a tensor from source to target. This is done +/// by recursively iterating over the ranges of the slice for each dimension. +template +void RecursiveCopy(const T &here, T &there, + const std::vector &mins, const std::vector &maxs, + std::vector idx, std::size_t active) +{ + const auto size = idx.size(); + for (std::size_t i = mins[active]; i < maxs[active]; i++) { + idx[active] = i; + if (active == size - 1) { + auto idxThere = idx; + for (std::size_t j = 0; j < size; j++) { + idxThere[j] -= mins[j]; + } + there(idxThere) = here(idx); + } else { + Internal::RecursiveCopy(here, there, mins, maxs, idx, active + 1); + } + } +} + +} // namespace SOFIE::Internal + +/// \class SOFIE::RTensor +/// \brief RTensor is a container with contiguous memory and shape information. +/// \tparam T Data-type of the tensor +/// +/// An RTensor is a vector-like container, which has additional shape information. +/// The elements of the multi-dimensional container can be accessed by their +/// indices in a coherent way without taking care about the one-dimensional memory +/// layout of the contiguous storage. This also allows to manipulate the shape +/// of the container without moving the actual elements in memory. Another feature +/// is that an RTensor can own the underlying contiguous memory but can also represent +/// only a view on existing data without owning it. 
+template > +class RTensor { +public: + // Typedefs + using Value_t = V; + using Shape_t = std::vector; + using Index_t = Shape_t; + using Slice_t = std::vector; + using Container_t = C; + +private: + Shape_t fShape; + Shape_t fStrides; + std::size_t fSize; + MemoryLayout fLayout; + Value_t *fData; + std::shared_ptr fContainer; + +protected: + void ReshapeInplace(const Shape_t &shape); + +public: + // Constructors + + /// \brief Construct a tensor as view on data + /// \param[in] data Pointer to data contiguous in memory + /// \param[in] shape Shape vector + /// \param[in] layout Memory layout + RTensor(Value_t *data, Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fLayout(layout), fData(data), fContainer(nullptr) + { + fSize = Internal::GetSizeFromShape(shape); + fStrides = Internal::ComputeStridesFromShape(shape, layout); + } + + /// \brief Construct a tensor as view on data + /// \param[in] data Pointer to data contiguous in memory + /// \param[in] shape Shape vector + /// \param[in] strides Strides vector + /// \param[in] layout Memory layout + RTensor(Value_t *data, Shape_t shape, Shape_t strides, MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fStrides(strides), fLayout(layout), fData(data), fContainer(nullptr) + { + fSize = Internal::GetSizeFromShape(shape); + } + + /// \brief Construct a tensor owning externally provided data + /// \param[in] container Shared pointer to data container + /// \param[in] shape Shape vector + /// \param[in] layout Memory layout + RTensor(std::shared_ptr container, Shape_t shape, + MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fLayout(layout), fContainer(container) + { + fSize = Internal::GetSizeFromShape(shape); + fStrides = Internal::ComputeStridesFromShape(shape, layout); + fData = std::data(*fContainer); + } + + /// \brief Construct a tensor owning data initialized with new container + /// \param[in] shape Shape vector + /// \param[in] layout Memory layout + 
RTensor(Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fLayout(layout) + { + // TODO: Document how data pointer is determined using STL iterator interface. + // TODO: Sanitize given container type with type traits + fSize = Internal::GetSizeFromShape(shape); + fStrides = Internal::ComputeStridesFromShape(shape, layout); + fContainer = std::make_shared(fSize); + fData = std::data(*fContainer); + } + + // Access elements + Value_t &operator()(const Index_t &idx); + const Value_t &operator() (const Index_t &idx) const; + template Value_t &operator()(Idx... idx); + template const Value_t &operator() (Idx... idx) const; + + // Access properties + std::size_t GetSize() const { return fSize; } + const Shape_t &GetShape() const { return fShape; } + const Shape_t &GetStrides() const { return fStrides; } + Value_t *GetData() { return fData; } + const Value_t *GetData() const { return fData; } + std::shared_ptr GetContainer() { return fContainer; } + const std::shared_ptr GetContainer() const { return fContainer; } + MemoryLayout GetMemoryLayout() const { return fLayout; } + bool IsView() const { return fContainer == nullptr; } + bool IsOwner() const { return !IsView(); } + + // Copy + RTensor Copy(MemoryLayout layout = MemoryLayout::RowMajor) const; + + // Transformations + RTensor Transpose() const; + RTensor Squeeze() const; + RTensor ExpandDims(int idx) const; + RTensor Reshape(const Shape_t &shape) const; + RTensor Resize(const Shape_t &shape); + RTensor Slice(const Slice_t &slice); + + // Iterator class + class Iterator { + private: + RTensor& fTensor; + Index_t::value_type fGlobalIndex; + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = Value_t; + using difference_type = std::ptrdiff_t; + using pointer = Value_t *; + using reference = Value_t &; + + Iterator(RTensor& x, typename Index_t::value_type idx) : fTensor(x), fGlobalIndex(idx) {} + Iterator& operator++() { fGlobalIndex++; return *this; } 
+ Iterator operator++(int) { auto tmp = *this; operator++(); return tmp; } + Iterator& operator--() { fGlobalIndex--; return *this; } + Iterator operator--(int) { auto tmp = *this; operator--(); return tmp; } + Iterator operator+(difference_type rhs) const { return Iterator(fTensor, fGlobalIndex + rhs); } + Iterator operator-(difference_type rhs) const { return Iterator(fTensor, fGlobalIndex - rhs); } + difference_type operator-(const Iterator& rhs) { return fGlobalIndex - rhs.GetGlobalIndex(); } + Iterator& operator+=(difference_type rhs) { fGlobalIndex += rhs; return *this; } + Iterator& operator-=(difference_type rhs) { fGlobalIndex -= rhs; return *this; } + Value_t& operator*() + { + auto idx = Internal::ComputeIndicesFromGlobalIndex(fTensor.GetShape(), fTensor.GetMemoryLayout(), fGlobalIndex); + return fTensor(idx); + } + bool operator==(const Iterator& rhs) const + { + if (fGlobalIndex == rhs.GetGlobalIndex()) return true; + return false; + } + bool operator!=(const Iterator& rhs) const { return !operator==(rhs); }; + bool operator>(const Iterator& rhs) const { return fGlobalIndex > rhs.GetGlobalIndex(); } + bool operator<(const Iterator& rhs) const { return fGlobalIndex < rhs.GetGlobalIndex(); } + bool operator>=(const Iterator& rhs) const { return fGlobalIndex >= rhs.GetGlobalIndex(); } + bool operator<=(const Iterator& rhs) const { return fGlobalIndex <= rhs.GetGlobalIndex(); } + typename Index_t::value_type GetGlobalIndex() const { return fGlobalIndex; }; + }; + + // Iterator interface + // TODO: Document that the iterator always iterates following the physical memory layout. 
+ Iterator begin() noexcept { + return Iterator(*this, 0); + } + Iterator end() noexcept { + return Iterator(*this, fSize); + } +}; + +/// \brief Reshape tensor in place +/// \param[in] shape Shape vector +/// Reshape tensor without changing the overall size +template +inline void RTensor::ReshapeInplace(const Shape_t &shape) +{ + const auto size = Internal::GetSizeFromShape(shape); + if (size != fSize) { + std::stringstream ss; + ss << "Cannot reshape tensor with size " << fSize << " into shape { "; + for (std::size_t i = 0; i < shape.size(); i++) { + if (i != shape.size() - 1) { + ss << shape[i] << ", "; + } else { + ss << shape[i] << " }."; + } + } + throw std::runtime_error(ss.str()); + } + + // Compute new strides from shape + auto strides = Internal::ComputeStridesFromShape(shape, fLayout); + fShape = shape; + fStrides = strides; +} + + +/// \brief Access elements +/// \param[in] idx Index vector +/// \return Reference to element +template +inline Value_t &RTensor::operator()(const Index_t &idx) +{ + const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx); + return fData[globalIndex]; +} + +/// \brief Access elements +/// \param[in] idx Index vector +/// \return Reference to element +template +inline const Value_t &RTensor::operator() (const Index_t &idx) const +{ + const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx); + return fData[globalIndex]; +} + +/// \brief Access elements +/// \param[in] idx Indices +/// \return Reference to element +template +template +Value_t &RTensor::operator()(Idx... idx) +{ + static_assert(Internal::and_types...>{}, + "Indices are not convertible to std::size_t."); + return operator()({static_cast(idx)...}); +} + +/// \brief Access elements +/// \param[in] idx Indices +/// \return Reference to element +template +template +const Value_t &RTensor::operator() (Idx... 
idx) const +{ + static_assert(Internal::and_types...>{}, + "Indices are not convertible to std::size_t."); + return operator()({static_cast(idx)...}); +} + +/// \brief Transpose +/// \returns New RTensor +/// The tensor is transposed by inverting the associated memory layout from row- +/// major to column-major and vice versa. Therefore, the underlying data is not +/// touched. +template +inline RTensor RTensor::Transpose() const +{ + MemoryLayout layout; + // Transpose by inverting memory layout + if (fLayout == MemoryLayout::RowMajor) { + layout = MemoryLayout::ColumnMajor; + } else if (fLayout == MemoryLayout::ColumnMajor) { + layout = MemoryLayout::RowMajor; + } else { + throw std::runtime_error("Memory layout is not known."); + } + + // Create copy of container + RTensor x(fData, fShape, fStrides, layout); + + // Reverse shape + std::reverse(x.fShape.begin(), x.fShape.end()); + + // Reverse strides + std::reverse(x.fStrides.begin(), x.fStrides.end()); + + return x; +} + +/// \brief Squeeze dimensions +/// \returns New RTensor +/// Squeeze removes the dimensions of size one from the shape. +template +inline RTensor RTensor::Squeeze() const +{ + // Remove dimensions of one and associated strides + Shape_t shape; + Shape_t strides; + for (std::size_t i = 0; i < fShape.size(); i++) { + if (fShape[i] != 1) { + shape.emplace_back(fShape[i]); + strides.emplace_back(fStrides[i]); + } + } + + // If all dimensions are 1, we need to keep one. + // This does not apply if the inital shape is already empty. Then, return + // the empty shape. + if (shape.size() == 0 && fShape.size() != 0) { + shape.emplace_back(1); + strides.emplace_back(1); + } + + // Create copy, attach new shape and strides and return + RTensor x(*this); + x.fShape = shape; + x.fStrides = strides; + return x; +} + +/// \brief Expand dimensions +/// \param[in] idx Index in shape vector where dimension is added +/// \returns New RTensor +/// Inserts a dimension of one into the shape. 
+template +inline RTensor RTensor::ExpandDims(int idx) const +{ + // Compose shape vector with additional dimensions and adjust strides + const int len = fShape.size(); + auto shape = fShape; + auto strides = fStrides; + if (idx < 0) { + idx = len + 1 + idx; + } + if (idx < 0) { + throw std::runtime_error("Given negative index is invalid."); + } + else if (idx > len) { + throw std::runtime_error("Given index is invalid."); + } + shape.insert(shape.begin() + idx, 1); + strides = Internal::ComputeStridesFromShape(shape, fLayout); + + // Create view copy, attach new shape and strides and return + RTensor x(*this); + x.fShape = shape; + x.fStrides = strides; + return x; +} + +/// \brief Reshape tensor +/// \param[in] shape Shape vector +/// \returns New RTensor +/// Reshape tensor without changing the overall size +template +inline RTensor RTensor::Reshape(const Shape_t &shape) const +{ + // Create copy, replace and return + RTensor x(*this); + x.ReshapeInplace(shape); + return x; +} + +/// \brief Resize tensor +/// \param[in] shape Shape vector +/// \returns New RTensor +/// Resize tensor into new shape +template +inline RTensor RTensor::Resize(const Shape_t &shape) +{ + // Create new tensor with the specified shape + RTensor x(shape, fLayout); + + // Copying contents from previous tensor + size_t n = (x.GetSize()>fSize) ? fSize : x.GetSize(); + std::copy(this->GetData(), this->GetData() + n, x.GetData() ); + + return x; +} + +/// \brief Create a slice of the tensor +/// \param[in] slice Slice vector +/// \returns New RTensor +/// A slice is a subset of the tensor defined by a vector of pairs of indices. 
+template +inline RTensor RTensor::Slice(const Slice_t &slice) +{ + // Sanitize size of slice + const auto sliceSize = slice.size(); + const auto shapeSize = fShape.size(); + if (sliceSize != shapeSize) { + std::stringstream ss; + ss << "Size of slice (" << sliceSize << ") is unequal number of dimensions (" << shapeSize << ")."; + throw std::runtime_error(ss.str()); + } + + // Sanitize slice indices + // TODO: Sanitize slice indices + /* + for (std::size_t i = 0; i < sliceSize; i++) { + } + */ + + // Convert -1 in slice to proper pair of indices + // TODO + + // Recompute shape and size + Shape_t shape(sliceSize); + for (std::size_t i = 0; i < sliceSize; i++) { + shape[i] = slice[i][1] - slice[i][0]; + } + auto size = Internal::GetSizeFromShape(shape); + + // Determine first element contributing to the slice and get the data pointer + Value_t *data; + Shape_t idx(sliceSize); + for (std::size_t i = 0; i < sliceSize; i++) { + idx[i] = slice[i][0]; + } + data = &operator()(idx); + + // Create copy and modify properties + RTensor x(*this); + x.fData = data; + x.fShape = shape; + x.fSize = size; + + // Squeeze tensor and return + return x.Squeeze(); +} + +/// Copy RTensor to new object +/// \param[in] layout Memory layout of the new RTensor +/// \returns New RTensor +/// The operation copies all elements of the current RTensor to a new RTensor +/// with the given layout contiguous in memory. Note that this copies by default +/// to a row major memory layout. 
+template +inline RTensor RTensor::Copy(MemoryLayout layout) const +{ + // Create new tensor with zeros owning the memory + RTensor r(fShape, layout); + + // Copy over the elements from this tensor + const auto mins = Shape_t(fShape.size()); + const auto maxs = fShape; + auto idx = mins; + Internal::RecursiveCopy(*this, r, mins, maxs, idx, 0); + + return r; +} + +/// \brief Pretty printing +/// \param[in] os Output stream +/// \param[in] x RTensor +/// \return Modified output stream +template +std::ostream &operator<<(std::ostream &os, RTensor &x) +{ + const auto shapeSize = x.GetShape().size(); + if (shapeSize == 1) { + os << "{ "; + const auto size = x.GetSize(); + for (std::size_t i = 0; i < size; i++) { + os << x({i}); + if (i != size - 1) + os << ", "; + } + os << " }"; + } else if (shapeSize == 2) { + os << "{"; + const auto shape = x.GetShape(); + for (std::size_t i = 0; i < shape[0]; i++) { + os << " { "; + for (std::size_t j = 0; j < shape[1]; j++) { + os << x({i, j}); + if (j < shape[1] - 1) { + os << ", "; + } else { + os << " "; + } + } + os << "}"; + } + os << " }"; + } else { + os << "{ printing not yet implemented for this rank }"; + } + return os; +} + +} // namespace SOFIE + +namespace cling { +template +std::string printValue(SOFIE::RTensor *x) +{ + std::stringstream ss; + ss << *x; + return ss.str(); +} +} // namespace cling + +#endif // SOFIE_RTENSOR