Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Exclude C/C++ headers from linguist language detection; when forced,
# classify *.h as CUDA for repository language statistics.
# NOTE(review): in .gitattributes the LAST matching attribute wins, so the
# original `*.h linguist-language=cpp` line was dead — it was immediately
# overridden by the cuda line below and has been removed.
*.h linguist-detectable=false
*.h linguist-language=cuda
11 changes: 11 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,17 @@ else()
message(STATUS "CUDA compiler not found, CUDA support will be disabled.")
endif()

# HIP support is only tested on Linux.
check_language(HIP)
if(CMAKE_HIP_COMPILER)
  option(ENABLE_HIP "Enable HIP/ROCm AMD GPU support" ON)
  # Plain variable name: if(${ENABLE_HIP}) double-dereferences and breaks if
  # the cached value ever names another variable or is empty.
  if(ENABLE_HIP)
    include(cmake/hip_init.cmake)
  endif()
else()
  message(STATUS "HIP compiler not found, HIP support will be disabled.")
endif()

add_subdirectory(include)
add_subdirectory(lib)

Expand Down
8 changes: 8 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ function(add_cuda_to_benchmark TARGET_NAME)
endif()
endfunction()

# Attach HIP compilation settings (arch flags, standard, debug/line-info
# options) to an existing benchmark target. Thin wrapper kept for symmetry
# with add_cuda_to_benchmark().
function(add_hip_to_benchmark TARGET_NAME)
  # Fail fast with a clear message instead of a generic error deeper inside
  # add_hip_to_target().
  if(NOT TARGET "${TARGET_NAME}")
    message(FATAL_ERROR "add_hip_to_benchmark: '${TARGET_NAME}' is not a target")
  endif()
  add_hip_to_target("${TARGET_NAME}")
endfunction()

function (add_generated_benchmark TARGET_NAME TEST_SOURCE EXTENSION DIR)

set(TEST_GENERATED_SOURCE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_${EXTENSION}/launcher.${EXTENSION}") #use the same name as the target )
Expand Down Expand Up @@ -67,6 +71,10 @@ function (discover_benchmark DIR)
add_generated_benchmark("${TARGET_NAME}" "${benchmark_source}" "cu" "${DIR_NAME}")
add_cuda_to_benchmark("${TARGET_NAME}_cu")
endif()
if (CMAKE_HIP_COMPILER AND ENABLE_HIP)
add_generated_benchmark("${TARGET_NAME}" "${benchmark_source}" "hip" "${DIR_NAME}")
add_hip_to_benchmark("${TARGET_NAME}_hip")
endif()
endforeach()
endfunction()

Expand Down
12 changes: 11 additions & 1 deletion cmake/discover_tests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ function(add_cuda_to_test TARGET_NAME)
endif()
endfunction()

# Attach HIP compilation settings (arch flags, standard, debug/line-info
# options) to an existing test target. Thin wrapper kept for symmetry with
# add_cuda_to_test().
function(add_hip_to_test TARGET_NAME)
  # Fail fast with a clear message instead of a generic error deeper inside
  # add_hip_to_target().
  if(NOT TARGET "${TARGET_NAME}")
    message(FATAL_ERROR "add_hip_to_test: '${TARGET_NAME}' is not a target")
  endif()
  add_hip_to_target("${TARGET_NAME}")
endfunction()

function (add_generated_test TARGET_NAME TEST_SOURCE EXTENSION DIR)

set(TEST_GENERATED_SOURCE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_${EXTENSION}/launcher.${EXTENSION}") #use the same name as the target )
Expand All @@ -33,7 +37,9 @@ function (add_generated_test TARGET_NAME TEST_SOURCE EXTENSION DIR)
target_include_directories(${TARGET_NAME_EXT} PRIVATE "${CMAKE_SOURCE_DIR}")
target_include_directories(${TARGET_NAME_EXT} PRIVATE "${DIR}")
target_link_libraries(${TARGET_NAME_EXT} PRIVATE FKL::FKL)
target_link_libraries(${TARGET_NAME_EXT} PRIVATE CUDA::cuda_driver)
if ("${EXTENSION}" STREQUAL "cu")
target_link_libraries(${TARGET_NAME_EXT} PRIVATE CUDA::cuda_driver)
endif()
if (NVRTC_ENABLE)
target_link_libraries(${TARGET_NAME_EXT} PRIVATE ${NVRTC_LIBRARIES})
target_compile_definitions(${TARGET_NAME_EXT} PRIVATE NVRTC_ENABLE)
Expand Down Expand Up @@ -78,6 +84,10 @@ function (discover_tests DIR)
add_generated_test("${TARGET_NAME}" "${test_source}" "cu" "${DIR_NAME}")
add_cuda_to_test("${TARGET_NAME}_cu")
endif()
if (CMAKE_HIP_COMPILER AND ENABLE_HIP)
add_generated_test("${TARGET_NAME}" "${test_source}" "hip" "${DIR_NAME}")
add_hip_to_test("${TARGET_NAME}_hip")
endif()

endforeach()
endfunction()
Expand Down
9 changes: 9 additions & 0 deletions cmake/hip_init.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Initialize HIP language support for the project.
# NOTE(review): CMAKE_HIP_COMPILER must be set BEFORE enable_language(HIP);
# the original set it afterwards, which has no effect on the already-enabled
# language. Only default it when the user has not chosen a compiler.
if(NOT DEFINED CMAKE_HIP_COMPILER)
  set(CMAKE_HIP_COMPILER "hipcc")
endif()
enable_language(HIP)

# Require C++17 semantics for HIP device/host code.
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_STANDARD_REQUIRED ON)

# Resolve includes relative to this file so they work regardless of the
# including directory's location.
include("${CMAKE_CURRENT_LIST_DIR}/libs/hip/hip.cmake")
include("${CMAKE_CURRENT_LIST_DIR}/libs/hip/archs.cmake")
11 changes: 11 additions & 0 deletions cmake/libs/hip/archs.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# HIP GPU architecture selection.
# Defaults to "native", which lets the compiler auto-detect the target GPU.
set(HIP_ARCH "native" CACHE STRING "HIP/ROCm GPU architecture to build for (e.g. native, gfx1100, gfx90a)")

# Apply the selected HIP architecture to TARGET_NAME.
# NOTE(review): the original if/else set the same property in both branches
# (when HIP_ARCH is "native" the else-branch also produced "native"), so a
# single unconditional set is exactly equivalent.
function(set_target_hip_arch_flags TARGET_NAME)
  set_target_properties(${TARGET_NAME} PROPERTIES HIP_ARCHITECTURES "${HIP_ARCH}")
endfunction()
22 changes: 22 additions & 0 deletions cmake/libs/hip/hip.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
option(ENABLE_HIP_LINE_INFO "Enable line info for HIP kernels compilation" ON)
option(ENABLE_HIP_DEBUG "Generate HIP debug information for device code" OFF)

# Resolve relative to this file rather than the caller's source directory.
include("${CMAKE_CURRENT_LIST_DIR}/target_generation.cmake")

# Default ROCm location; user-overridable via the cache.
# NOTE(review): pinned to 7.2.0 — consider defaulting to the /opt/rocm
# symlink so any installed ROCm version is found. TODO confirm with CI setup.
set(ROCM_ROOT "/opt/rocm-7.2.0" CACHE PATH "Root directory of the ROCm installation")
list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")

find_package(hip CONFIG REQUIRED)

# Configure TARGET_NAME for HIP compilation: baseline properties, GPU arch
# flags, optional debug / line-info flags, and the HIP host runtime library.
function(add_hip_to_target TARGET_NAME)
  set_default_hip_target_properties(${TARGET_NAME})
  set_target_hip_arch_flags(${TARGET_NAME})

  # Plain names in if(): if(${VAR}) double-dereferences the cached value.
  if(ENABLE_HIP_DEBUG)
    add_hip_debug_support_to_target(${TARGET_NAME})
  endif()
  if(ENABLE_HIP_LINE_INFO)
    add_hip_lineinfo_to_target(${TARGET_NAME})
  endif()
  # Host-side HIP runtime only; device-side linking is handled by CMake's
  # HIP language support.
  target_link_libraries(${TARGET_NAME} PRIVATE hip::host)
endfunction()
27 changes: 27 additions & 0 deletions cmake/libs/hip/target_generation.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Set baseline compile properties for a HIP target: C++17, standard required,
# no compiler extensions, plus optional template-depth and MSVC host flags.
function(set_default_hip_target_properties TARGET_NAME)
  if(WIN32)
    # Pass big-object and conforming-preprocessor flags to the MSVC host
    # compiler. NOTE(review): the original unquoted form
    #   $<...:-Xcompiler=/bigobj /Zc:preprocessor>
    # split at the space, leaving an unbalanced generator expression and a
    # bare `/Zc:preprocessor>` argument; each host flag needs its own
    # -Xcompiler= inside a single quoted genex. TODO confirm hipcc on Windows
    # accepts -Xcompiler pass-through for both flags.
    target_compile_options(${TARGET_NAME} PRIVATE
      "$<$<COMPILE_LANGUAGE:HIP>:-Xcompiler=/bigobj;-Xcompiler=/Zc:preprocessor>")
  endif()
  set_target_properties(${TARGET_NAME} PROPERTIES
    HIP_STANDARD 17
    HIP_STANDARD_REQUIRED ON
    HIP_EXTENSIONS OFF)
  # Quoted comparison: the original `if(NOT(${TEMPLATE_DEPTH} STREQUAL ...))`
  # is a hard configure error when TEMPLATE_DEPTH is empty, and DEFINED guards
  # against it being unset entirely.
  if(DEFINED TEMPLATE_DEPTH AND NOT "${TEMPLATE_DEPTH}" STREQUAL "default")
    target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:HIP>:-ftemplate-depth=${TEMPLATE_DEPTH}>)
    if(NOT WIN32)
      target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-ftemplate-depth=${TEMPLATE_DEPTH}>)
    endif()
  endif()
endfunction()

# Add device-code debug information (-ggdb) to HIP compilation, but only in
# Debug configurations ($<CONFIG:debug> matches case-insensitively, so the
# conventional "Debug" build type is covered).
function(add_hip_debug_support_to_target TARGET_NAME)
target_compile_options(${TARGET_NAME} PRIVATE $<$<AND:$<CONFIG:debug>,$<COMPILE_LANGUAGE:HIP>>:-ggdb>)
endfunction()

# Add lightweight line-table-only debug info to HIP kernels (useful for
# profilers), skipped when full device debug info is already requested.
function(add_hip_lineinfo_to_target TARGET_NAME)
  if(NOT ENABLE_HIP_DEBUG)
    # NOTE(review): the original tested CMAKE_HIP_HOST_COMPILER_ID, which is
    # not a variable CMake defines, so the condition was always false and the
    # flag was never added. The HIP compiler itself is identified by
    # CMAKE_HIP_COMPILER_ID (Clang for ROCm toolchains) — TODO confirm on CI.
    if(CMAKE_HIP_COMPILER_ID STREQUAL "Clang")
      target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:HIP>:-gline-tables-only>)
    endif()
  endif()
endfunction()
6 changes: 6 additions & 0 deletions cmake/tests/add_generated_test.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ function(add_cuda_to_test TARGET_NAME)
endif()
endfunction()

# Attach HIP compilation settings (arch flags, standard, debug/line-info
# options) to an existing test target. Thin wrapper kept for symmetry with
# add_cuda_to_test().
function(add_hip_to_test TARGET_NAME)
  # Fail fast with a clear message instead of a generic error deeper inside
  # add_hip_to_target().
  if(NOT TARGET "${TARGET_NAME}")
    message(FATAL_ERROR "add_hip_to_test: '${TARGET_NAME}' is not a target")
  endif()
  add_hip_to_target("${TARGET_NAME}")
endfunction()

function(configure_test_target_flags TARGET_NAME TEST_SOURCE DIR)

set(TEST_GENERATED_SOURCE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_${EXTENSION}/launcher.${EXTENSION}") #use the same name as the target )
Expand Down Expand Up @@ -48,6 +52,8 @@ endfunction()
function (set_ide_target_folder TARGET_NAME DIR_PARENT_PATH EXTENSION)
if (${EXTENSION} STREQUAL "cu")
set(FKL_BACKEND "cuda")
elseif(${EXTENSION} STREQUAL "hip")
set(FKL_BACKEND "hip")
elseif(${EXTENSION} STREQUAL "cpp")
set(FKL_BACKEND "cpu")
else()
Expand Down
4 changes: 3 additions & 1 deletion cmake/tests/add_shared_test_libs.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ function (add_shared_test_lib TARGET_BASE_NAME DIR EXTENSION FUNDAMENTAL_TYPE)
set(TARGET_NAME "${TARGET_BASE_NAME}_${FUNDAMENTAL_TYPE}")
add_shared_target("${TARGET_BASE_NAME}" "${EXTENSION}" "${FUNDAMENTAL_TYPE}" "${DIR}")
if ("${EXTENSION}" STREQUAL "cu")
add_cuda_to_test("${TARGET_NAME}_${EXTENSION}")
add_cuda_to_test("${TARGET_NAME}_${EXTENSION}")
elseif ("${EXTENSION}" STREQUAL "hip")
add_hip_to_test("${TARGET_NAME}_${EXTENSION}")
endif()

endfunction()
7 changes: 7 additions & 0 deletions cmake/tests/discover_tests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ function (discover_tests DIR)
add_generated_test("${TARGET_NAME}" "${TEST_SOURCE}" "cu" "${DIR_RELATIVE_PATH}")
add_cuda_to_test("${TARGET_NAME}_cu")
endif()
endif()

if (CMAKE_HIP_COMPILER AND ENABLE_HIP)
if (${POS_ONLY_CPU} EQUAL -1) #if the source file does not contain "__ONLY_CPU__"
add_generated_test("${TARGET_NAME}" "${TEST_SOURCE}" "hip" "${DIR_RELATIVE_PATH}")
add_hip_to_test("${TARGET_NAME}_hip")
endif()
endif()
endforeach()
endfunction()
Expand Down
14 changes: 11 additions & 3 deletions include/fused_kernel/algorithms/image_processing/image.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,25 +66,33 @@ namespace fk {
return Image<PF>(data.crop(dataPoint, newDataDims), newWidth, newHeight);
}
#if !defined(NVRTC_COMPILER)
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
inline void uploadTo(Image& other, cudaStream_t stream = 0) {
data.uploadTo(other.data, stream);
}

inline void downloadTo(Image& other, cudaStream_t stream = 0) {
data.downloadTo(other.data, stream);
}

#endif
#if defined(__NVCC__) || CLANG_HOST_DEVICE
inline void upload(Stream_<ParArch::GPU_NVIDIA>& stream) {
data.upload(stream);
}
inline void download(Stream_<ParArch::GPU_NVIDIA>& stream) {
data.download(stream);
}
#elif HIP_HOST_DEVICE
inline void upload(Stream_<ParArch::GPU_AMD>& stream) {
data.upload(stream);
}
inline void download(Stream_<ParArch::GPU_AMD>& stream) {
data.download(stream);
}
#else
inline void upload(Stream& stream) {}
inline void download(Stream& stream) {}
#endif // defined(__NVCC__) || defined(__HIP__) || defined(NVRTC_ENABLED)
#endif // defined(__NVCC__) || CLANG_HOST_DEVICE || HIP_HOST_DEVICE
#endif // defined(NVRTC_COMPILER)

FK_HOST_CNST VectorType_t<BaseType, PixelFormatTraits<PF>::cn> readAt(const Point p) const {
Expand Down
36 changes: 25 additions & 11 deletions include/fused_kernel/core/data/ptr_nd.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

namespace fk {
enum class MemType { Device, Host, HostPinned, DeviceAndPinned };
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
constexpr MemType defaultMemType = MemType::DeviceAndPinned;
#else
constexpr MemType defaultMemType = MemType::Host;
Expand Down Expand Up @@ -152,7 +152,7 @@ namespace fk {
}

inline constexpr void allocDevice() {
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
int currentDevice;
gpuErrchk(cudaGetDevice(&currentDevice));
gpuErrchk(cudaSetDevice(deviceID));
Expand All @@ -171,7 +171,7 @@ namespace fk {
}

inline constexpr void allocHostPinned() {
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
int currentDevice;
gpuErrchk(cudaGetDevice(&currentDevice));
gpuErrchk(cudaSetDevice(deviceID));
Expand All @@ -186,7 +186,7 @@ namespace fk {
}

inline constexpr void allocDeviceAndPinned() {
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
int currentDevice;
gpuErrchk(cudaGetDevice(&currentDevice));
gpuErrchk(cudaSetDevice(deviceID));
Expand All @@ -209,7 +209,7 @@ namespace fk {
switch (type) {
case MemType::Device:
{
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
gpuErrchk(cudaFree(ref->ptr));
#else
throw std::runtime_error("Device memory deallocation not supported in non-CUDA compilation.");
Expand All @@ -223,7 +223,7 @@ namespace fk {
}
case MemType::HostPinned:
{
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
gpuErrchk(cudaFreeHost(ref->ptr));
#else
throw std::runtime_error("Host pinned memory deallocation not supported in non-CUDA compilation.");
Expand All @@ -232,7 +232,7 @@ namespace fk {
}
case MemType::DeviceAndPinned:
{
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
gpuErrchk(cudaFree(ref->ptr));
gpuErrchk(cudaFreeHost(ref->pinnedPtr));
#else
Expand All @@ -249,7 +249,7 @@ namespace fk {
}
}

#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
inline void copy(const RawPtr<D, T>& thisPtr, RawPtr<D, T>& other, const cudaMemcpyKind& kind,
cudaStream_t stream = 0) const {
if ((other.dims.pitch == other.dims.width * sizeof(T)) && (thisPtr.dims.pitch == thisPtr.dims.width * sizeof(T))) {
Expand Down Expand Up @@ -480,7 +480,7 @@ namespace fk {
return *this;
}

#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
inline void uploadTo(Ptr<D, T>& other, cudaStream_t stream = 0) {
constexpr cudaMemcpyKind kind = cudaMemcpyHostToDevice;
constexpr MemType otherExpectedMemType1 = MemType::Device;
Expand Down Expand Up @@ -516,7 +516,8 @@ namespace fk {
throw std::runtime_error("Download can only copy from Device pointers.");
}
}

#endif
#if defined(__NVCC__) || CLANG_HOST_DEVICE
inline void upload(Stream_<ParArch::GPU_NVIDIA>& stream) {
if (type == MemType::DeviceAndPinned) {
constexpr cudaMemcpyKind kind = cudaMemcpyHostToDevice;
Expand All @@ -529,10 +530,23 @@ namespace fk {
copy(ptr_a, ptr_pinned, kind, stream);
}
}
#elif HIP_HOST_DEVICE
inline void upload(Stream_<ParArch::GPU_AMD>& stream) {
if (type == MemType::DeviceAndPinned) {
constexpr cudaMemcpyKind kind = cudaMemcpyHostToDevice;
copy(ptr_pinned, ptr_a, kind, stream.getHIPStream());
}
}
inline void download(Stream_<ParArch::GPU_AMD>& stream) {
if (type == MemType::DeviceAndPinned) {
constexpr cudaMemcpyKind kind = cudaMemcpyDeviceToHost;
copy(ptr_a, ptr_pinned, kind, stream.getHIPStream());
}
}
#else
inline void upload(Stream& stream) {}
inline void download(Stream& stream) {}
#endif // defined(__NVCC__) || defined(__HIP__) || defined(NVRTC_ENABLED)
#endif // defined(__NVCC__) || CLANG_HOST_DEVICE || HIP_HOST_DEVICE

inline T at(const Point p) const {
if (type != MemType::Device) {
Expand Down
16 changes: 16 additions & 0 deletions include/fused_kernel/core/data/ptr_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,22 @@ namespace fk {
else {
Executor<TransformDPP<PA>>::executeOperations(stream, ReadSet<T>::build(value, outputPtr.dims()), PerThreadWrite<D, T>::build(output));
}
#elif HIP_HOST_DEVICE
if constexpr (PA == ParArch::GPU_AMD) {
if (outputPtr.getMemType() == MemType::Device || outputPtr.getMemType() == MemType::DeviceAndPinned) {
Executor<TransformDPP<ParArch::GPU_AMD>>::executeOperations(stream, ReadSet<T>::build(value, outputPtr.dims()), PerThreadWrite<D, T>::build(output));
if (outputPtr.getMemType() == MemType::DeviceAndPinned) {
Stream_<ParArch::CPU> cpuStream;
Executor<TransformDPP<ParArch::CPU>>::executeOperations(cpuStream, ReadSet<T>::build(value, outputPtr.dims()), PerThreadWrite<D, T>::build(outputPtr.ptrPinned()));
}
}
else {
Executor<TransformDPP<ParArch::GPU_AMD>>::executeOperations(stream, ReadSet<T>::build(value, outputPtr.dims()), PerThreadWrite<D, T>::build(output));
}
}
else {
Executor<TransformDPP<PA>>::executeOperations(stream, ReadSet<T>::build(value, outputPtr.dims()), PerThreadWrite<D, T>::build(output));
}
#else
Executor<TransformDPP<PA>>::executeOperations(stream, ReadSet<T>::build(value, outputPtr.dims()), PerThreadWrite<D, T>::build(outputPtr));
#endif
Expand Down
3 changes: 3 additions & 0 deletions include/fused_kernel/core/data/vector_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,9 @@ namespace fk {

#if defined(__NVCC__)
#include <vector_types.h>
#elif HIP_HOST_DEVICE
// hip_runtime.h (included via utils.h) provides HIP vector types (char1, uchar1, etc.)
// via <hip/hip_vector_types.h>, so no additional includes needed here.
#else
using char1 = fk::Char1;
using uchar1 = fk::Uchar1;
Expand Down
Loading
Loading