diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..d1793a74 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +*.h linguist-detectable=false +*.h linguist-language=cpp +*.h linguist-language=cuda \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 92440e77..18545c46 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,17 @@ else() message(STATUS "CUDA compiler not found, CUDA support will be disabled.") endif() +# HIP support is only tested on Linux. +check_language(HIP) +if (CMAKE_HIP_COMPILER) + option (ENABLE_HIP "Enable HIP/ROCm AMD GPU support" ON) + if (${ENABLE_HIP}) + include(cmake/hip_init.cmake) + endif() +else() + message(STATUS "HIP compiler not found, HIP support will be disabled.") +endif() + add_subdirectory(include) add_subdirectory(lib) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 69ccf5a0..bfb60413 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -15,6 +15,10 @@ function(add_cuda_to_benchmark TARGET_NAME) endif() endfunction() +function(add_hip_to_benchmark TARGET_NAME) + add_hip_to_target(${TARGET_NAME}) +endfunction() + function (add_generated_benchmark TARGET_NAME TEST_SOURCE EXTENSION DIR) set(TEST_GENERATED_SOURCE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_${EXTENSION}/launcher.${EXTENSION}") #use the same name as the target ) @@ -67,6 +71,10 @@ function (discover_benchmark DIR) add_generated_benchmark("${TARGET_NAME}" "${benchmark_source}" "cu" "${DIR_NAME}") add_cuda_to_benchmark("${TARGET_NAME}_cu") endif() + if (CMAKE_HIP_COMPILER AND ENABLE_HIP) + add_generated_benchmark("${TARGET_NAME}" "${benchmark_source}" "hip" "${DIR_NAME}") + add_hip_to_benchmark("${TARGET_NAME}_hip") + endif() endforeach() endfunction() diff --git a/cmake/discover_tests.cmake b/cmake/discover_tests.cmake index 03f6bd9f..ecdcbce7 100644 --- a/cmake/discover_tests.cmake +++ b/cmake/discover_tests.cmake @@ -15,6 +15,10 @@ function(add_cuda_to_test 
TARGET_NAME) endif() endfunction() +function(add_hip_to_test TARGET_NAME) + add_hip_to_target(${TARGET_NAME}) +endfunction() + function (add_generated_test TARGET_NAME TEST_SOURCE EXTENSION DIR) set(TEST_GENERATED_SOURCE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_${EXTENSION}/launcher.${EXTENSION}") #use the same name as the target ) @@ -33,7 +37,9 @@ function (add_generated_test TARGET_NAME TEST_SOURCE EXTENSION DIR) target_include_directories(${TARGET_NAME_EXT} PRIVATE "${CMAKE_SOURCE_DIR}") target_include_directories(${TARGET_NAME_EXT} PRIVATE "${DIR}") target_link_libraries(${TARGET_NAME_EXT} PRIVATE FKL::FKL) - target_link_libraries(${TARGET_NAME_EXT} PRIVATE CUDA::cuda_driver) + if ("${EXTENSION}" STREQUAL "cu") + target_link_libraries(${TARGET_NAME_EXT} PRIVATE CUDA::cuda_driver) + endif() if (NVRTC_ENABLE) target_link_libraries(${TARGET_NAME_EXT} PRIVATE ${NVRTC_LIBRARIES}) target_compile_definitions(${TARGET_NAME_EXT} PRIVATE NVRTC_ENABLE) @@ -78,6 +84,10 @@ function (discover_tests DIR) add_generated_test("${TARGET_NAME}" "${test_source}" "cu" "${DIR_NAME}") add_cuda_to_test("${TARGET_NAME}_cu") endif() + if (CMAKE_HIP_COMPILER AND ENABLE_HIP) + add_generated_test("${TARGET_NAME}" "${test_source}" "hip" "${DIR_NAME}") + add_hip_to_test("${TARGET_NAME}_hip") + endif() endforeach() endfunction() diff --git a/cmake/hip_init.cmake b/cmake/hip_init.cmake new file mode 100644 index 00000000..f57886f6 --- /dev/null +++ b/cmake/hip_init.cmake @@ -0,0 +1,9 @@ +enable_language(HIP) + +# Set HIP compiler and standard +set(CMAKE_HIP_COMPILER "hipcc") +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +include(cmake/libs/hip/hip.cmake) +include(cmake/libs/hip/archs.cmake) diff --git a/cmake/libs/hip/archs.cmake b/cmake/libs/hip/archs.cmake new file mode 100644 index 00000000..e80fbc19 --- /dev/null +++ b/cmake/libs/hip/archs.cmake @@ -0,0 +1,11 @@ +# HIP GPU architecture selection +# Defaults to "native" which lets the compiler auto-detect the target 
GPU +set(HIP_ARCH "native" CACHE STRING "HIP/ROCm GPU architecture to build for (e.g. native, gfx1100, gfx90a)") + +function(set_target_hip_arch_flags TARGET_NAME) + if ("${HIP_ARCH}" STREQUAL "native") + set_target_properties(${TARGET_NAME} PROPERTIES HIP_ARCHITECTURES "native") + else() + set_target_properties(${TARGET_NAME} PROPERTIES HIP_ARCHITECTURES "${HIP_ARCH}") + endif() +endfunction() diff --git a/cmake/libs/hip/hip.cmake b/cmake/libs/hip/hip.cmake new file mode 100644 index 00000000..91c3219c --- /dev/null +++ b/cmake/libs/hip/hip.cmake @@ -0,0 +1,22 @@ +option(ENABLE_HIP_LINE_INFO "Enable line info for HIP kernels compilation" ON) +option(ENABLE_HIP_DEBUG "Generate HIP debug information for device code" OFF) + +include(cmake/libs/hip/target_generation.cmake) +set(ROCM_ROOT "/opt/rocm-7.2.0" CACHE PATH "Root directory of the ROCm installation") +list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}") + +find_package(hip CONFIG REQUIRED) + +function(add_hip_to_target TARGET_NAME) + set_default_hip_target_properties(${TARGET_NAME}) + set_target_hip_arch_flags(${TARGET_NAME}) + + if (${ENABLE_HIP_DEBUG}) + add_hip_debug_support_to_target(${TARGET_NAME}) + endif() + if (${ENABLE_HIP_LINE_INFO}) + add_hip_lineinfo_to_target(${TARGET_NAME}) + endif() + #hip-lang::device hip-lang::amdhip64 + target_link_libraries(${TARGET_NAME} PRIVATE hip::host) +endfunction() diff --git a/cmake/libs/hip/target_generation.cmake b/cmake/libs/hip/target_generation.cmake new file mode 100644 index 00000000..a3c96208 --- /dev/null +++ b/cmake/libs/hip/target_generation.cmake @@ -0,0 +1,27 @@ +function(set_default_hip_target_properties TARGET_NAME) + if (WIN32) + target_compile_options(${TARGET_NAME} PRIVATE $<$:-Xcompiler=/bigobj /Zc:preprocessor>) + endif() + set_target_properties(${TARGET_NAME} PROPERTIES + HIP_STANDARD 17 + HIP_STANDARD_REQUIRED ON + HIP_EXTENSIONS OFF) + if (NOT(${TEMPLATE_DEPTH} STREQUAL "default")) + target_compile_options(${TARGET_NAME} PRIVATE 
$<$:-ftemplate-depth=${TEMPLATE_DEPTH}>) + if (NOT WIN32) + target_compile_options(${TARGET_NAME} PRIVATE $<$:-ftemplate-depth=${TEMPLATE_DEPTH}>) + endif() + endif() +endfunction() + +function(add_hip_debug_support_to_target TARGET_NAME) + target_compile_options(${TARGET_NAME} PRIVATE $<$,$>:-ggdb>) +endfunction() + +function(add_hip_lineinfo_to_target TARGET_NAME) + if (NOT ${ENABLE_HIP_DEBUG}) + if (CMAKE_HIP_HOST_COMPILER_ID STREQUAL "Clang") + target_compile_options(${TARGET_NAME} PRIVATE $<$:-gline-tables-only>) + endif() + endif() +endfunction() diff --git a/cmake/tests/add_generated_test.cmake b/cmake/tests/add_generated_test.cmake index a0ec9bc4..8602db39 100644 --- a/cmake/tests/add_generated_test.cmake +++ b/cmake/tests/add_generated_test.cmake @@ -15,6 +15,10 @@ function(add_cuda_to_test TARGET_NAME) endif() endfunction() +function(add_hip_to_test TARGET_NAME) + add_hip_to_target(${TARGET_NAME}) +endfunction() + function(configure_test_target_flags TARGET_NAME TEST_SOURCE DIR) set(TEST_GENERATED_SOURCE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_${EXTENSION}/launcher.${EXTENSION}") #use the same name as the target ) @@ -48,6 +52,8 @@ endfunction() function (set_ide_target_folder TARGET_NAME DIR_PARENT_PATH EXTENSION) if (${EXTENSION} STREQUAL "cu") set(FKL_BACKEND "cuda") + elseif(${EXTENSION} STREQUAL "hip") + set(FKL_BACKEND "hip") elseif(${EXTENSION} STREQUAL "cpp") set(FKL_BACKEND "cpu") else() diff --git a/cmake/tests/add_shared_test_libs.cmake b/cmake/tests/add_shared_test_libs.cmake index 6ed34494..1220d0ff 100644 --- a/cmake/tests/add_shared_test_libs.cmake +++ b/cmake/tests/add_shared_test_libs.cmake @@ -33,7 +33,9 @@ function (add_shared_test_lib TARGET_BASE_NAME DIR EXTENSION FUNDAMENTAL_TYPE) set(TARGET_NAME "${TARGET_BASE_NAME}_${FUNDAMENTAL_TYPE}") add_shared_target("${TARGET_BASE_NAME}" "${EXTENSION}" "${FUNDAMENTAL_TYPE}" "${DIR}") if ("${EXTENSION}" STREQUAL "cu") - add_cuda_to_test("${TARGET_NAME}_${EXTENSION}") + 
add_cuda_to_test("${TARGET_NAME}_${EXTENSION}") + elseif ("${EXTENSION}" STREQUAL "hip") + add_hip_to_test("${TARGET_NAME}_${EXTENSION}") endif() endfunction() \ No newline at end of file diff --git a/cmake/tests/discover_tests.cmake b/cmake/tests/discover_tests.cmake index 57416626..59e63b53 100644 --- a/cmake/tests/discover_tests.cmake +++ b/cmake/tests/discover_tests.cmake @@ -31,6 +31,13 @@ function (discover_tests DIR) add_generated_test("${TARGET_NAME}" "${TEST_SOURCE}" "cu" "${DIR_RELATIVE_PATH}") add_cuda_to_test("${TARGET_NAME}_cu") endif() + endif() + + if (CMAKE_HIP_COMPILER AND ENABLE_HIP) + if (${POS_ONLY_CPU} EQUAL -1) #if the source file does not contain "__ONLY_CPU__" + add_generated_test("${TARGET_NAME}" "${TEST_SOURCE}" "hip" "${DIR_RELATIVE_PATH}") + add_hip_to_test("${TARGET_NAME}_hip") + endif() endif() endforeach() endfunction() diff --git a/include/fused_kernel/algorithms/image_processing/image.h b/include/fused_kernel/algorithms/image_processing/image.h index e9e352d2..a895e57b 100644 --- a/include/fused_kernel/algorithms/image_processing/image.h +++ b/include/fused_kernel/algorithms/image_processing/image.h @@ -66,7 +66,7 @@ namespace fk { return Image(data.crop(dataPoint, newDataDims), newWidth, newHeight); } #if !defined(NVRTC_COMPILER) -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE inline void uploadTo(Image& other, cudaStream_t stream = 0) { data.uploadTo(other.data, stream); } @@ -74,17 +74,25 @@ namespace fk { inline void downloadTo(Image& other, cudaStream_t stream = 0) { data.downloadTo(other.data, stream); } - +#endif +#if defined(__NVCC__) || CLANG_HOST_DEVICE inline void upload(Stream_& stream) { data.upload(stream); } inline void download(Stream_& stream) { data.download(stream); } +#elif HIP_HOST_DEVICE + inline void upload(Stream_& stream) { + data.upload(stream); + } + inline void download(Stream_& stream) { + data.download(stream); + } #else inline void upload(Stream& stream) {} inline void download(Stream& 
stream) {} -#endif // defined(__NVCC__) || defined(__HIP__) || defined(NVRTC_ENABLED) +#endif // defined(__NVCC__) || CLANG_HOST_DEVICE || HIP_HOST_DEVICE #endif // defined(NVRTC_COMPILER) FK_HOST_CNST VectorType_t::cn> readAt(const Point p) const { diff --git a/include/fused_kernel/core/data/ptr_nd.h b/include/fused_kernel/core/data/ptr_nd.h index 0e70f253..ff90c044 100644 --- a/include/fused_kernel/core/data/ptr_nd.h +++ b/include/fused_kernel/core/data/ptr_nd.h @@ -27,7 +27,7 @@ namespace fk { enum class MemType { Device, Host, HostPinned, DeviceAndPinned }; -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE constexpr MemType defaultMemType = MemType::DeviceAndPinned; #else constexpr MemType defaultMemType = MemType::Host; @@ -152,7 +152,7 @@ namespace fk { } inline constexpr void allocDevice() { - #if defined(__NVCC__) + #if defined(__NVCC__) || HIP_HOST_DEVICE int currentDevice; gpuErrchk(cudaGetDevice(¤tDevice)); gpuErrchk(cudaSetDevice(deviceID)); @@ -171,7 +171,7 @@ namespace fk { } inline constexpr void allocHostPinned() { - #if defined(__NVCC__) + #if defined(__NVCC__) || HIP_HOST_DEVICE int currentDevice; gpuErrchk(cudaGetDevice(¤tDevice)); gpuErrchk(cudaSetDevice(deviceID)); @@ -186,7 +186,7 @@ namespace fk { } inline constexpr void allocDeviceAndPinned() { - #if defined(__NVCC__) + #if defined(__NVCC__) || HIP_HOST_DEVICE int currentDevice; gpuErrchk(cudaGetDevice(¤tDevice)); gpuErrchk(cudaSetDevice(deviceID)); @@ -209,7 +209,7 @@ namespace fk { switch (type) { case MemType::Device: { - #if defined(__NVCC__) + #if defined(__NVCC__) || HIP_HOST_DEVICE gpuErrchk(cudaFree(ref->ptr)); #else throw std::runtime_error("Device memory deallocation not supported in non-CUDA compilation."); @@ -223,7 +223,7 @@ namespace fk { } case MemType::HostPinned: { - #if defined(__NVCC__) + #if defined(__NVCC__) || HIP_HOST_DEVICE gpuErrchk(cudaFreeHost(ref->ptr)); #else throw std::runtime_error("Host pinned memory deallocation not supported in non-CUDA 
compilation."); @@ -232,7 +232,7 @@ namespace fk { } case MemType::DeviceAndPinned: { -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE gpuErrchk(cudaFree(ref->ptr)); gpuErrchk(cudaFreeHost(ref->pinnedPtr)); #else @@ -249,7 +249,7 @@ namespace fk { } } -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE inline void copy(const RawPtr& thisPtr, RawPtr& other, const cudaMemcpyKind& kind, cudaStream_t stream = 0) const { if ((other.dims.pitch == other.dims.width * sizeof(T)) && (thisPtr.dims.pitch == thisPtr.dims.width * sizeof(T))) { @@ -480,7 +480,7 @@ namespace fk { return *this; } -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE inline void uploadTo(Ptr& other, cudaStream_t stream = 0) { constexpr cudaMemcpyKind kind = cudaMemcpyHostToDevice; constexpr MemType otherExpectedMemType1 = MemType::Device; @@ -516,7 +516,8 @@ namespace fk { throw std::runtime_error("Download can only copy from Device pointers."); } } - +#endif +#if defined(__NVCC__) || CLANG_HOST_DEVICE inline void upload(Stream_& stream) { if (type == MemType::DeviceAndPinned) { constexpr cudaMemcpyKind kind = cudaMemcpyHostToDevice; @@ -529,10 +530,23 @@ namespace fk { copy(ptr_a, ptr_pinned, kind, stream); } } +#elif HIP_HOST_DEVICE + inline void upload(Stream_& stream) { + if (type == MemType::DeviceAndPinned) { + constexpr cudaMemcpyKind kind = cudaMemcpyHostToDevice; + copy(ptr_pinned, ptr_a, kind, stream.getHIPStream()); + } + } + inline void download(Stream_& stream) { + if (type == MemType::DeviceAndPinned) { + constexpr cudaMemcpyKind kind = cudaMemcpyDeviceToHost; + copy(ptr_a, ptr_pinned, kind, stream.getHIPStream()); + } + } #else inline void upload(Stream& stream) {} inline void download(Stream& stream) {} -#endif // defined(__NVCC__) || defined(__HIP__) || defined(NVRTC_ENABLED) +#endif // defined(__NVCC__) || CLANG_HOST_DEVICE || HIP_HOST_DEVICE inline T at(const Point p) const { if (type != MemType::Device) { diff --git 
a/include/fused_kernel/core/data/ptr_utils.h b/include/fused_kernel/core/data/ptr_utils.h index 56dc1f55..483bdb24 100644 --- a/include/fused_kernel/core/data/ptr_utils.h +++ b/include/fused_kernel/core/data/ptr_utils.h @@ -39,6 +39,22 @@ namespace fk { else { Executor>::executeOperations(stream, ReadSet::build(value, outputPtr.dims()), PerThreadWrite::build(output)); } +#elif HIP_HOST_DEVICE + if constexpr (PA == ParArch::GPU_AMD) { + if (outputPtr.getMemType() == MemType::Device || outputPtr.getMemType() == MemType::DeviceAndPinned) { + Executor>::executeOperations(stream, ReadSet::build(value, outputPtr.dims()), PerThreadWrite::build(output)); + if (outputPtr.getMemType() == MemType::DeviceAndPinned) { + Stream_ cpuStream; + Executor>::executeOperations(cpuStream, ReadSet::build(value, outputPtr.dims()), PerThreadWrite::build(outputPtr.ptrPinned())); + } + } + else { + Executor>::executeOperations(stream, ReadSet::build(value, outputPtr.dims()), PerThreadWrite::build(output)); + } + } + else { + Executor>::executeOperations(stream, ReadSet::build(value, outputPtr.dims()), PerThreadWrite::build(output)); + } #else Executor>::executeOperations(stream, ReadSet::build(value, outputPtr.dims()), PerThreadWrite::build(outputPtr)); #endif diff --git a/include/fused_kernel/core/data/vector_types.h b/include/fused_kernel/core/data/vector_types.h index 3ebd2258..d39c11d9 100644 --- a/include/fused_kernel/core/data/vector_types.h +++ b/include/fused_kernel/core/data/vector_types.h @@ -261,6 +261,9 @@ namespace fk { #if defined(__NVCC__) #include +#elif HIP_HOST_DEVICE +// hip_runtime.h (included via utils.h) provides HIP vector types (char1, uchar1, etc.) +// via , so no additional includes needed here. 
#else using char1 = fk::Char1; using uchar1 = fk::Uchar1; diff --git a/include/fused_kernel/core/execution_model/data_parallel_patterns.h b/include/fused_kernel/core/execution_model/data_parallel_patterns.h index 24da3e92..98a374d6 100644 --- a/include/fused_kernel/core/execution_model/data_parallel_patterns.h +++ b/include/fused_kernel/core/execution_model/data_parallel_patterns.h @@ -68,6 +68,7 @@ namespace fk { // namespace FusedKernel template struct TransformDPPBase { friend struct TransformDPP; // Allow TransformDPP to access private members + friend struct TransformDPP; // Allow TransformDPP to access private members friend struct TransformDPP; // Allow TransformDPPBase to access private members private: using Details = DPPDetails; @@ -235,6 +236,36 @@ namespace fk { // namespace FusedKernel }; #endif // defined(__NVCC__) +#if HIP_HOST_DEVICE + template + struct TransformDPP, void>> { + private: + using Parent = TransformDPPBase; + using Details = DPPDetails; + public: + static constexpr ParArch PAR_ARCH = ParArch::GPU_AMD; + template + FK_HOST_DEVICE_FUSE ActiveThreads getActiveThreads(const Details& details, + const FirstIOp& iOp) { + return Parent::getActiveThreads(details, iOp); + } + + template + FK_DEVICE_FUSE void exec(const Details& details, const IOps&... 
iOps) { + const int x = (blockDim.x * blockIdx.x) + threadIdx.x; + const int y = (blockDim.y * blockIdx.y) + threadIdx.y; + const int z = blockIdx.z; + const Point thread{ x, y, z }; + + const ActiveThreads activeThreads = getActiveThreads(details, get_arg<0>(iOps...)); + + if (x < activeThreads.x && y < activeThreads.y) { + Parent::execute_thread(thread, activeThreads, iOps...); + } + } + }; +#endif // HIP_HOST_DEVICE + template struct TransformDPP, void>> { private: @@ -267,15 +298,16 @@ namespace fk { // namespace FusedKernel template struct DivergentBatchTransformDPP; - template + template struct DivergentBatchTransformDPPBase { friend struct DivergentBatchTransformDPP; // Allow DivergentBatchTransformDPP to access private members + friend struct DivergentBatchTransformDPP; // Allow DivergentBatchTransformDPP to access private members friend struct DivergentBatchTransformDPP; // Allow DivergentBatchTransformDPPBase to access private members private: template FK_HOST_DEVICE_FUSE void launchTransformDPP(const IOps&... 
iOps) { using Details = TransformDPPDetails; - TransformDPP::exec(Details{}, iOps...); + TransformDPP::exec(Details{}, iOps...); } template @@ -296,6 +328,9 @@ namespace fk { // namespace FusedKernel template <> struct DivergentBatchTransformDPPDetails {}; + template <> + struct DivergentBatchTransformDPPDetails {}; + template <> struct DivergentBatchTransformDPPDetails { uint numPlanes; @@ -305,7 +340,7 @@ namespace fk { // namespace FusedKernel template struct DivergentBatchTransformDPP { private: - using Parent = DivergentBatchTransformDPPBase; + using Parent = DivergentBatchTransformDPPBase; public: using DPPDetails = DivergentBatchTransformDPPDetails; static constexpr ParArch PAR_ARCH = ParArch::GPU_NVIDIA; @@ -319,10 +354,25 @@ namespace fk { // namespace FusedKernel } }; #endif // defined(__NVCC__) +#if HIP_HOST_DEVICE + template + struct DivergentBatchTransformDPP { + private: + using Parent = DivergentBatchTransformDPPBase; + public: + using DPPDetails = DivergentBatchTransformDPPDetails; + static constexpr ParArch PAR_ARCH = ParArch::GPU_AMD; + template + FK_DEVICE_FUSE void exec(const DPPDetails&, const IOpSequenceTypes&... 
iOpSequences) { + const uint z = blockIdx.z; + Parent::template divergent_operate<1>(z, iOpSequences...); + } + }; +#endif // HIP_HOST_DEVICE template struct DivergentBatchTransformDPP { private: - using Parent = DivergentBatchTransformDPPBase; + using Parent = DivergentBatchTransformDPPBase; public: using DPPDetails = DivergentBatchTransformDPPDetails; static constexpr ParArch PAR_ARCH = ParArch::CPU; diff --git a/include/fused_kernel/core/execution_model/executor_details/executor_kernels.h b/include/fused_kernel/core/execution_model/executor_details/executor_kernels.h index 41348be9..a8ab8078 100644 --- a/include/fused_kernel/core/execution_model/executor_details/executor_kernels.h +++ b/include/fused_kernel/core/execution_model/executor_details/executor_kernels.h @@ -15,7 +15,7 @@ #ifndef FK_EXECUTOR_KERNELS_H #define FK_EXECUTOR_KERNELS_H -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE namespace fk { template __global__ void launchDivergentBatchTransformDPP_Kernel(const __grid_constant__ DPPDetails details, diff --git a/include/fused_kernel/core/execution_model/executors.h b/include/fused_kernel/core/execution_model/executors.h index d33375e5..8465e720 100644 --- a/include/fused_kernel/core/execution_model/executors.h +++ b/include/fused_kernel/core/execution_model/executors.h @@ -23,13 +23,13 @@ #include #include -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE #include #endif namespace fk { -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE struct CtxDim3 { uint x; uint y; @@ -155,8 +155,9 @@ FK_HOST_FUSE void executeOperations(const std::array, Batch>& input, co struct Executor { FK_STATIC_STRUCT(Executor, Executor) static_assert(DataParallelPattern::PAR_ARCH == ParArch::GPU_NVIDIA || + DataParallelPattern::PAR_ARCH == ParArch::GPU_AMD || DataParallelPattern::PAR_ARCH == ParArch::CPU, - "Only GPU_NVIDIA and CPU supported"); + "Only GPU_NVIDIA, GPU_AMD and CPU are supported"); }; #endif @@ -188,7 +189,7 @@ 
FK_HOST_FUSE void executeOperations(const std::array, Batch>& input, co DECLARE_EXECUTOR_PARENT_IMPL }; -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE struct ComputeBestSolutionBase { FK_HOST_FUSE uint computeDiscardedThreads(const uint width, const uint height, const uint blockDimx, const uint blockDimy) { const uint modX = width % blockDimx; @@ -359,7 +360,109 @@ FK_HOST_FUSE void executeOperations(const std::array, Batch>& input, co executeOperations_helper(stream, iOpSequences...); } }; -#endif +#endif // defined(__NVCC__) || CLANG_HOST_DEVICE +#if HIP_HOST_DEVICE + template + struct Executor> { + private: + using Child = Executor>; + using Parent = BaseExecutor; + template + FK_HOST_FUSE void executeOperations_helper(Stream_& stream_, const IOps&... iOps) { + const hipStream_t stream = stream_.getHIPStream(); + constexpr ParArch PA = ParArch::GPU_AMD; + const auto tDetails = TransformDPP::build_details(iOps...); + if constexpr (decltype(tDetails)::TFI::ENABLED) { + const ActiveThreads activeThreads = tDetails.activeThreads; + + const CtxDim3 ctx_block = getDefaultBlockSize(activeThreads.x, activeThreads.y); + + const dim3 block{ ctx_block.x, ctx_block.y, 1 }; + const dim3 grid{ static_cast(ceil(activeThreads.x / static_cast(block.x))), + static_cast(ceil(activeThreads.y / static_cast(block.y))), + activeThreads.z }; + if (!tDetails.threadDivisible) { + launchTransformDPP_Kernel<<>>(tDetails, iOps...); + gpuErrchk(hipGetLastError()); + } else { + launchTransformDPP_Kernel<<>>(tDetails, iOps...); + gpuErrchk(hipGetLastError()); + } + } else { + const auto readOp = get_arg<0>(iOps...); + + const ActiveThreads activeThreads = readOp.getActiveThreads(); + + const CtxDim3 ctx_block = getDefaultBlockSize(activeThreads.x, activeThreads.y); + + const dim3 block{ ctx_block.x, ctx_block.y, 1 }; + const dim3 grid{ static_cast(ceil(activeThreads.x / static_cast(block.x))), + static_cast(ceil(activeThreads.y / static_cast(block.y))), + activeThreads.z }; + 
launchTransformDPP_Kernel<<>>(tDetails, iOps...); + gpuErrchk(hipGetLastError()); + } + } + public: + FK_STATIC_STRUCT(Executor, Child) + FK_HOST_FUSE ParArch parArch() { + return ParArch::GPU_AMD; + } + DECLARE_EXECUTOR_PARENT_IMPL + }; + + template + struct Executor> { + private: + using DPPType = DivergentBatchTransformDPP; + using DPPDetails = typename DPPType::DPPDetails; + using SelfType = Executor; + + template + FK_HOST_FUSE ActiveThreads getActiveThreads(const IOpSequenceTypes&... iOpSequences) { + const uint x = cxp::max::f(get<0>(iOpSequences.iOps).getActiveThreads().x...); + const uint y = cxp::max::f(get<0>(iOpSequences.iOps).getActiveThreads().y...); + const uint z = cxp::sum::f(get<0>(iOpSequences.iOps).getActiveThreads().z...); + return ActiveThreads{ x, y, z }; + } + + template + FK_HOST_FUSE auto fuseBackSequence(const IOpSequence& iOpSeq) { + return buildOperationSequence_tup( + apply([](auto&&... args) { + return BackFuser::fuse_back(std::forward(args)...); + }, iOpSeq.iOps) + ); + } + + template + FK_HOST_FUSE void executeOperationsFused(Stream_& stream, const IOpSequenceTypes&... iOpSequences) { + const ActiveThreads activeThreads = getActiveThreads(iOpSequences...); + const DPPDetails details{}; + + const dim3 block(cxp::min::f(activeThreads.x, 32u), cxp::min::f(activeThreads.y, 8u)); + const dim3 grid(ceil(activeThreads.x / static_cast(block.x)), + ceil(activeThreads.y / static_cast(block.y)), activeThreads.z); + launchDivergentBatchTransformDPP_Kernel<<>>(details, iOpSequences...); + gpuErrchk(hipGetLastError()); + } + + template + FK_HOST_FUSE void executeOperations_helper(Stream_& stream, const IOpSequenceTypes&... iOpSequences) { + executeOperationsFused(stream, fuseBackSequence(iOpSequences)...); + } + + public: + FK_STATIC_STRUCT(Executor, SelfType) + FK_HOST_FUSE ParArch parArch() { + return ParArch::GPU_AMD; + } + template + FK_HOST_FUSE void executeOperations(Stream_& stream, const IOpSequenceTypes&... 
iOpSequences) { + executeOperations_helper(stream, iOpSequences...); + } + }; +#endif // HIP_HOST_DEVICE } // namespace fk #endif // FK_EXECUTORS_CUH \ No newline at end of file diff --git a/include/fused_kernel/core/execution_model/parallel_architectures.h b/include/fused_kernel/core/execution_model/parallel_architectures.h index 487f6bf9..679d3691 100644 --- a/include/fused_kernel/core/execution_model/parallel_architectures.h +++ b/include/fused_kernel/core/execution_model/parallel_architectures.h @@ -48,7 +48,12 @@ namespace fk { #undef PARALLEL_ARCHITECTURES #if defined(__NVCC__) +#ifndef HIP_HOST_DEVICE +static_assert(false); +#endif constexpr ParArch defaultParArch = ParArch::GPU_NVIDIA; +#elif HIP_HOST_DEVICE == 1 + constexpr ParArch defaultParArch = ParArch::GPU_AMD; #else constexpr ParArch defaultParArch = ParArch::CPU; #endif diff --git a/include/fused_kernel/core/execution_model/stream.h b/include/fused_kernel/core/execution_model/stream.h index 9803eb03..1dc0e756 100644 --- a/include/fused_kernel/core/execution_model/stream.h +++ b/include/fused_kernel/core/execution_model/stream.h @@ -20,8 +20,8 @@ #if defined(__NVCC__) #include -#elif defined(__HIP__) -#include +#elif HIP_HOST_DEVICE +#include #endif namespace fk { @@ -109,6 +109,67 @@ namespace fk { }; #endif +#if HIP_HOST_DEVICE + template <> + class Stream_ final : public BaseStream { + hipStream_t m_stream; + bool m_isMine{ false }; + + inline void initFromOther(const Stream_& other) { + m_stream = other.m_stream; + m_isMine = other.m_isMine; + } + + public: + Stream_() : BaseStream() { + gpuErrchk(hipStreamCreate(&m_stream)); + m_isMine = true; + } + Stream_(const Stream_& other) : BaseStream(other) { + initFromOther(other); + } + explicit Stream_(const hipStream_t& stream) : m_stream(stream), BaseStream() {} + + hipStream_t operator()() const { + return m_stream; + } + + Stream_& operator=(const Stream_& other) { + if (this != &other) { + BaseStream::operator=(other); + initFromOther(other); + } 
+ return *this; + } + + Stream_(Stream_&&) = delete; + Stream_& operator=(Stream_&&) = delete; + + ~Stream_() { + if ( this->getRefCount() == 0 && m_stream != 0 && m_isMine) { + sync(); + gpuErrchk(hipStreamDestroy(m_stream)); + } + } + + operator hipStream_t() const { + return m_stream; + } + inline hipStream_t getHIPStream() const { + return m_stream; + } + inline void sync() final { + gpuErrchk(hipStreamSynchronize(m_stream)); + } + constexpr inline enum ParArch getParArch() const { + return ParArch::GPU_AMD; + }; + static constexpr inline enum ParArch parArch() { + return ParArch::GPU_AMD; + } + }; +#endif + template <> class Stream_ final : public BaseStream { public: diff --git a/include/fused_kernel/core/utils/compiler_macros.h b/include/fused_kernel/core/utils/compiler_macros.h index 41d19186..9795c8d2 100644 --- a/include/fused_kernel/core/utils/compiler_macros.h +++ b/include/fused_kernel/core/utils/compiler_macros.h @@ -22,4 +22,11 @@ #define _MSC_VER_EXISTS 0 #endif +// HIP platform detection: __HIP__ is defined by clang when compiling HIP code +#if defined(__HIP__) +#define HIP_HOST_DEVICE 1 +#else +#define HIP_HOST_DEVICE 0 +#endif + #endif // COMPILER_MACROS_H \ No newline at end of file diff --git a/include/fused_kernel/core/utils/utils.h b/include/fused_kernel/core/utils/utils.h index 307aa2b3..92c0545e 100644 --- a/include/fused_kernel/core/utils/utils.h +++ b/include/fused_kernel/core/utils/utils.h @@ -23,6 +23,8 @@ #if defined(__NVCC__) #include +#elif HIP_HOST_DEVICE +#include #endif #if defined(NVRTC_ENABLED) @@ -31,7 +33,7 @@ #endif #endif // NVRTC_COMPILER -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE #define FK_DEVICE_FUSE __device__ __forceinline__ static constexpr #define FK_DEVICE_CNST __device__ __forceinline__ constexpr #define FK_HOST_DEVICE_FUSE __host__ FK_DEVICE_FUSE @@ -147,7 +149,27 @@ namespace fk { } // namespace fk #define gpuErrchk(ans) { fk::gpuAssert((ans), __FILE__, __LINE__, true); } -#endif // 
defined(__NVCC__) +#elif HIP_HOST_DEVICE +namespace fk { + inline void gpuAssert(hipError_t code, + const char *file, + int line, + bool abort = true) { + if (code != hipSuccess) { + std::string message = "GPU Error: "; + message.append(hipGetErrorString(code)); + message.append(" File: "); + message.append(file); + message.append(" Line:"); + message.append(std::to_string(line).c_str()); + message.append("\n"); + if (abort) throw std::runtime_error(message.c_str()); + } + } +} // namespace fk + +#define gpuErrchk(ans) { fk::gpuAssert((ans), __FILE__, __LINE__, true); } +#endif // (__NVCC__) || HIP_HOST_DEVICE // Null type, used for Operation required aliases that can not still be known, // because they are deduced from a backwards operation that is till not defined. diff --git a/tests/main.cpp.hip b/tests/main.cpp.hip new file mode 100644 index 00000000..34e1649c --- /dev/null +++ b/tests/main.cpp.hip @@ -0,0 +1,23 @@ +/* Copyright 2023 Oscar Amoros Huguet + Copyright 2023 Albert Andaluz Gonzalez + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + + + +#include +#include +int main(int argc, char **argv) { + return launch(); +} + diff --git a/utests/algorithm/image_processing/utest_saturate/CMakeLists.txt b/utests/algorithm/image_processing/utest_saturate/CMakeLists.txt index ea9fba20..369b5978 100644 --- a/utests/algorithm/image_processing/utest_saturate/CMakeLists.txt +++ b/utests/algorithm/image_processing/utest_saturate/CMakeLists.txt @@ -44,6 +44,27 @@ if (CMAKE_CUDA_COMPILER AND ENABLE_CUDA) endforeach() endif() - - \ No newline at end of file +if (CMAKE_HIP_COMPILER AND ENABLE_HIP) + set(EXTENSION hip) + string(TOUPPER ${EXTENSION} EXTENSION_UPPER) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_NAME}.h.in + ${CMAKE_BINARY_DIR}/${TARGET_NAME}/${TARGET_NAME}_${EXTENSION}.h) + + add_generated_test_stub("${TARGET_NAME}_hip" + "${CMAKE_BINARY_DIR}/${TARGET_NAME}/${TARGET_NAME}_${EXTENSION}.h" + "") + set_ide_target_folder(${TARGET_NAME}_${EXTENSION} "/utests/algorithm/image_processing" "${EXTENSION}") + + add_hip_to_test("${TARGET_NAME}_hip") + foreach(FUNDAMENTAL_TYPE ${FUNDAMENTAL_TYPES}) + add_shared_test_lib(${TARGET_NAME} + "" + "${EXTENSION}" + "${FUNDAMENTAL_TYPE}" + ) + set_property(TARGET "${TARGET_NAME}_${FUNDAMENTAL_TYPE}_${EXTENSION}" PROPERTY FOLDER "/utests/algorithm/image_processing/hip/${TARGET_NAME}") + target_link_libraries(${TARGET_NAME}_${EXTENSION} PRIVATE "${TARGET_NAME}_${FUNDAMENTAL_TYPE}_${EXTENSION}") + endforeach() + +endif() \ No newline at end of file diff --git a/utests/algorithm/image_processing/utest_saturate/utest_saturate_ftype.hip.in b/utests/algorithm/image_processing/utest_saturate/utest_saturate_ftype.hip.in new file mode 100644 index 00000000..811a5942 --- /dev/null +++ b/utests/algorithm/image_processing/utest_saturate/utest_saturate_ftype.hip.in @@ -0,0 +1,26 @@ +/* Copyright 2025 Oscar Amoros Huguet + Copyright 2025 Grup Mediapro S.L.U + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance 
with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "utest_saturate_@FUNDAMENTAL_TYPE@_@EXTENSION@.h" + +START_ADDING_TESTS +using Fundamental = fk::RemoveType_t<0, fk::StandardTypes>; +addAllOutputTestsForInput(std::make_index_sequence{}); +STOP_ADDING_TESTS + +int fk::utest_saturate_@FUNDAMENTAL_TYPE@_@EXTENSION@::launch() { + RUN_ALL_TESTS + return 0; +} \ No newline at end of file