Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Exclude C/C++ headers from linguist language detection; when forced,
# classify *.h as CUDA for repository language statistics.
# NOTE(review): in .gitattributes the LAST matching attribute wins, so the
# original `*.h linguist-language=cpp` line was dead — it was immediately
# overridden by the cuda line below and has been removed.
*.h linguist-detectable=false
*.h linguist-language=cuda
11 changes: 11 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,17 @@ else()
message(STATUS "CUDA compiler not found, CUDA support will be disabled.")
endif()

# HIP support is only tested on Linux.
check_language(HIP)
if(CMAKE_HIP_COMPILER)
  option(ENABLE_HIP "Enable HIP/ROCm AMD GPU support" ON)
  # Plain variable name: if(${ENABLE_HIP}) double-dereferences and breaks if
  # the cached value ever names another variable or is empty.
  if(ENABLE_HIP)
    include(cmake/hip_init.cmake)
  endif()
else()
  message(STATUS "HIP compiler not found, HIP support will be disabled.")
endif()

add_subdirectory(include)
add_subdirectory(lib)

Expand Down
8 changes: 8 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ function(add_cuda_to_benchmark TARGET_NAME)
endif()
endfunction()

# Attach HIP compilation settings (arch flags, standard, debug/line-info
# options) to an existing benchmark target. Thin wrapper kept for symmetry
# with add_cuda_to_benchmark().
function(add_hip_to_benchmark TARGET_NAME)
  # Fail fast with a clear message instead of a generic error deeper inside
  # add_hip_to_target().
  if(NOT TARGET "${TARGET_NAME}")
    message(FATAL_ERROR "add_hip_to_benchmark: '${TARGET_NAME}' is not a target")
  endif()
  add_hip_to_target("${TARGET_NAME}")
endfunction()

function (add_generated_benchmark TARGET_NAME TEST_SOURCE EXTENSION DIR)

set(TEST_GENERATED_SOURCE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_${EXTENSION}/launcher.${EXTENSION}") #use the same name as the target )
Expand Down Expand Up @@ -67,6 +71,10 @@ function (discover_benchmark DIR)
add_generated_benchmark("${TARGET_NAME}" "${benchmark_source}" "cu" "${DIR_NAME}")
add_cuda_to_benchmark("${TARGET_NAME}_cu")
endif()
if (CMAKE_HIP_COMPILER AND ENABLE_HIP)
add_generated_benchmark("${TARGET_NAME}" "${benchmark_source}" "hip" "${DIR_NAME}")
add_hip_to_benchmark("${TARGET_NAME}_hip")
endif()
endforeach()
endfunction()

Expand Down
12 changes: 11 additions & 1 deletion cmake/discover_tests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ function(add_cuda_to_test TARGET_NAME)
endif()
endfunction()

# Attach HIP compilation settings (arch flags, standard, debug/line-info
# options) to an existing test target. Thin wrapper kept for symmetry with
# add_cuda_to_test().
function(add_hip_to_test TARGET_NAME)
  # Fail fast with a clear message instead of a generic error deeper inside
  # add_hip_to_target().
  if(NOT TARGET "${TARGET_NAME}")
    message(FATAL_ERROR "add_hip_to_test: '${TARGET_NAME}' is not a target")
  endif()
  add_hip_to_target("${TARGET_NAME}")
endfunction()

function (add_generated_test TARGET_NAME TEST_SOURCE EXTENSION DIR)

set(TEST_GENERATED_SOURCE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_${EXTENSION}/launcher.${EXTENSION}") #use the same name as the target )
Expand All @@ -33,7 +37,9 @@ function (add_generated_test TARGET_NAME TEST_SOURCE EXTENSION DIR)
target_include_directories(${TARGET_NAME_EXT} PRIVATE "${CMAKE_SOURCE_DIR}")
target_include_directories(${TARGET_NAME_EXT} PRIVATE "${DIR}")
target_link_libraries(${TARGET_NAME_EXT} PRIVATE FKL::FKL)
target_link_libraries(${TARGET_NAME_EXT} PRIVATE CUDA::cuda_driver)
if ("${EXTENSION}" STREQUAL "cu")
target_link_libraries(${TARGET_NAME_EXT} PRIVATE CUDA::cuda_driver)
endif()
if (NVRTC_ENABLE)
target_link_libraries(${TARGET_NAME_EXT} PRIVATE ${NVRTC_LIBRARIES})
target_compile_definitions(${TARGET_NAME_EXT} PRIVATE NVRTC_ENABLE)
Expand Down Expand Up @@ -78,6 +84,10 @@ function (discover_tests DIR)
add_generated_test("${TARGET_NAME}" "${test_source}" "cu" "${DIR_NAME}")
add_cuda_to_test("${TARGET_NAME}_cu")
endif()
if (CMAKE_HIP_COMPILER AND ENABLE_HIP)
add_generated_test("${TARGET_NAME}" "${test_source}" "hip" "${DIR_NAME}")
add_hip_to_test("${TARGET_NAME}_hip")
endif()

endforeach()
endfunction()
Expand Down
9 changes: 9 additions & 0 deletions cmake/hip_init.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Initialize HIP language support for the project.
# NOTE(review): CMAKE_HIP_COMPILER must be set BEFORE enable_language(HIP);
# the original set it afterwards, which has no effect on the already-enabled
# language. Only default it when the user has not chosen a compiler.
if(NOT DEFINED CMAKE_HIP_COMPILER)
  set(CMAKE_HIP_COMPILER "hipcc")
endif()
enable_language(HIP)

# Require C++17 semantics for HIP device/host code.
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_STANDARD_REQUIRED ON)

# Resolve includes relative to this file so they work regardless of the
# including directory's location.
include("${CMAKE_CURRENT_LIST_DIR}/libs/hip/hip.cmake")
include("${CMAKE_CURRENT_LIST_DIR}/libs/hip/archs.cmake")
11 changes: 11 additions & 0 deletions cmake/libs/hip/archs.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# HIP GPU architecture selection.
# Defaults to "native", which lets the compiler auto-detect the target GPU.
set(HIP_ARCH "native" CACHE STRING "HIP/ROCm GPU architecture to build for (e.g. native, gfx1100, gfx90a)")

# Apply the selected HIP architecture to TARGET_NAME.
# NOTE(review): the original if/else set the same property in both branches
# (when HIP_ARCH is "native" the else-branch also produced "native"), so a
# single unconditional set is exactly equivalent.
function(set_target_hip_arch_flags TARGET_NAME)
  set_target_properties(${TARGET_NAME} PROPERTIES HIP_ARCHITECTURES "${HIP_ARCH}")
endfunction()
22 changes: 22 additions & 0 deletions cmake/libs/hip/hip.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
option(ENABLE_HIP_LINE_INFO "Enable line info for HIP kernels compilation" ON)
option(ENABLE_HIP_DEBUG "Generate HIP debug information for device code" OFF)

# Resolve relative to this file rather than the caller's source directory.
include("${CMAKE_CURRENT_LIST_DIR}/target_generation.cmake")

# Default ROCm location; user-overridable via the cache.
# NOTE(review): pinned to 7.2.0 — consider defaulting to the /opt/rocm
# symlink so any installed ROCm version is found. TODO confirm with CI setup.
set(ROCM_ROOT "/opt/rocm-7.2.0" CACHE PATH "Root directory of the ROCm installation")
list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")

find_package(hip CONFIG REQUIRED)

# Configure TARGET_NAME for HIP compilation: baseline properties, GPU arch
# flags, optional debug / line-info flags, and the HIP host runtime library.
function(add_hip_to_target TARGET_NAME)
  set_default_hip_target_properties(${TARGET_NAME})
  set_target_hip_arch_flags(${TARGET_NAME})

  # Plain names in if(): if(${VAR}) double-dereferences the cached value.
  if(ENABLE_HIP_DEBUG)
    add_hip_debug_support_to_target(${TARGET_NAME})
  endif()
  if(ENABLE_HIP_LINE_INFO)
    add_hip_lineinfo_to_target(${TARGET_NAME})
  endif()
  # Host-side HIP runtime only; device-side linking is handled by CMake's
  # HIP language support.
  target_link_libraries(${TARGET_NAME} PRIVATE hip::host)
endfunction()
27 changes: 27 additions & 0 deletions cmake/libs/hip/target_generation.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Set baseline compile properties for a HIP target: C++17, standard required,
# no compiler extensions, plus optional template-depth and MSVC host flags.
function(set_default_hip_target_properties TARGET_NAME)
  if(WIN32)
    # Pass big-object and conforming-preprocessor flags to the MSVC host
    # compiler. NOTE(review): the original unquoted form
    #   $<...:-Xcompiler=/bigobj /Zc:preprocessor>
    # split at the space, leaving an unbalanced generator expression and a
    # bare `/Zc:preprocessor>` argument; each host flag needs its own
    # -Xcompiler= inside a single quoted genex. TODO confirm hipcc on Windows
    # accepts -Xcompiler pass-through for both flags.
    target_compile_options(${TARGET_NAME} PRIVATE
      "$<$<COMPILE_LANGUAGE:HIP>:-Xcompiler=/bigobj;-Xcompiler=/Zc:preprocessor>")
  endif()
  set_target_properties(${TARGET_NAME} PROPERTIES
    HIP_STANDARD 17
    HIP_STANDARD_REQUIRED ON
    HIP_EXTENSIONS OFF)
  # Quoted comparison: the original `if(NOT(${TEMPLATE_DEPTH} STREQUAL ...))`
  # is a hard configure error when TEMPLATE_DEPTH is empty, and DEFINED guards
  # against it being unset entirely.
  if(DEFINED TEMPLATE_DEPTH AND NOT "${TEMPLATE_DEPTH}" STREQUAL "default")
    target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:HIP>:-ftemplate-depth=${TEMPLATE_DEPTH}>)
    if(NOT WIN32)
      target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-ftemplate-depth=${TEMPLATE_DEPTH}>)
    endif()
  endif()
endfunction()

# Add device-code debug information (-ggdb) to HIP compilation, but only in
# Debug configurations ($<CONFIG:debug> matches case-insensitively, so the
# conventional "Debug" build type is covered).
function(add_hip_debug_support_to_target TARGET_NAME)
target_compile_options(${TARGET_NAME} PRIVATE $<$<AND:$<CONFIG:debug>,$<COMPILE_LANGUAGE:HIP>>:-ggdb>)
endfunction()

# Add lightweight line-table-only debug info to HIP kernels (useful for
# profilers), skipped when full device debug info is already requested.
function(add_hip_lineinfo_to_target TARGET_NAME)
  if(NOT ENABLE_HIP_DEBUG)
    # NOTE(review): the original tested CMAKE_HIP_HOST_COMPILER_ID, which is
    # not a variable CMake defines, so the condition was always false and the
    # flag was never added. The HIP compiler itself is identified by
    # CMAKE_HIP_COMPILER_ID (Clang for ROCm toolchains) — TODO confirm on CI.
    if(CMAKE_HIP_COMPILER_ID STREQUAL "Clang")
      target_compile_options(${TARGET_NAME} PRIVATE $<$<COMPILE_LANGUAGE:HIP>:-gline-tables-only>)
    endif()
  endif()
endfunction()
6 changes: 6 additions & 0 deletions cmake/tests/add_generated_test.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ function(add_cuda_to_test TARGET_NAME)
endif()
endfunction()

# Attach HIP compilation settings (arch flags, standard, debug/line-info
# options) to an existing test target. Thin wrapper kept for symmetry with
# add_cuda_to_test().
function(add_hip_to_test TARGET_NAME)
  # Fail fast with a clear message instead of a generic error deeper inside
  # add_hip_to_target().
  if(NOT TARGET "${TARGET_NAME}")
    message(FATAL_ERROR "add_hip_to_test: '${TARGET_NAME}' is not a target")
  endif()
  add_hip_to_target("${TARGET_NAME}")
endfunction()

function(configure_test_target_flags TARGET_NAME TEST_SOURCE DIR)

set(TEST_GENERATED_SOURCE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_${EXTENSION}/launcher.${EXTENSION}") #use the same name as the target )
Expand Down Expand Up @@ -48,6 +52,8 @@ endfunction()
function (set_ide_target_folder TARGET_NAME DIR_PARENT_PATH EXTENSION)
if (${EXTENSION} STREQUAL "cu")
set(FKL_BACKEND "cuda")
elseif(${EXTENSION} STREQUAL "hip")
set(FKL_BACKEND "hip")
elseif(${EXTENSION} STREQUAL "cpp")
set(FKL_BACKEND "cpu")
else()
Expand Down
4 changes: 3 additions & 1 deletion cmake/tests/add_shared_test_libs.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ function (add_shared_test_lib TARGET_BASE_NAME DIR EXTENSION FUNDAMENTAL_TYPE)
set(TARGET_NAME "${TARGET_BASE_NAME}_${FUNDAMENTAL_TYPE}")
add_shared_target("${TARGET_BASE_NAME}" "${EXTENSION}" "${FUNDAMENTAL_TYPE}" "${DIR}")
if ("${EXTENSION}" STREQUAL "cu")
add_cuda_to_test("${TARGET_NAME}_${EXTENSION}")
add_cuda_to_test("${TARGET_NAME}_${EXTENSION}")
elseif ("${EXTENSION}" STREQUAL "hip")
add_hip_to_test("${TARGET_NAME}_${EXTENSION}")
endif()

endfunction()
7 changes: 7 additions & 0 deletions cmake/tests/discover_tests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ function (discover_tests DIR)
add_generated_test("${TARGET_NAME}" "${TEST_SOURCE}" "cu" "${DIR_RELATIVE_PATH}")
add_cuda_to_test("${TARGET_NAME}_cu")
endif()
endif()

if (CMAKE_HIP_COMPILER AND ENABLE_HIP)
if (${POS_ONLY_CPU} EQUAL -1) #if the source file does not contain "__ONLY_CPU__"
add_generated_test("${TARGET_NAME}" "${TEST_SOURCE}" "hip" "${DIR_RELATIVE_PATH}")
add_hip_to_test("${TARGET_NAME}_hip")
endif()
endif()
endforeach()
endfunction()
Expand Down
14 changes: 11 additions & 3 deletions include/fused_kernel/algorithms/image_processing/image.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,25 +66,33 @@ namespace fk {
return Image<PF>(data.crop(dataPoint, newDataDims), newWidth, newHeight);
}
#if !defined(NVRTC_COMPILER)
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
inline void uploadTo(Image& other, cudaStream_t stream = 0) {
data.uploadTo(other.data, stream);
}

inline void downloadTo(Image& other, cudaStream_t stream = 0) {
data.downloadTo(other.data, stream);
}

#endif
#if defined(__NVCC__) || CLANG_HOST_DEVICE
inline void upload(Stream_<ParArch::GPU_NVIDIA>& stream) {
data.upload(stream);
}
inline void download(Stream_<ParArch::GPU_NVIDIA>& stream) {
data.download(stream);
}
#elif HIP_HOST_DEVICE
inline void upload(Stream_<ParArch::GPU_AMD>& stream) {
data.upload(stream);
}
inline void download(Stream_<ParArch::GPU_AMD>& stream) {
data.download(stream);
}
#else
inline void upload(Stream& stream) {}
inline void download(Stream& stream) {}
#endif // defined(__NVCC__) || defined(__HIP__) || defined(NVRTC_ENABLED)
#endif // defined(__NVCC__) || CLANG_HOST_DEVICE || HIP_HOST_DEVICE
#endif // defined(NVRTC_COMPILER)

FK_HOST_CNST VectorType_t<BaseType, PixelFormatTraits<PF>::cn> readAt(const Point p) const {
Expand Down
36 changes: 25 additions & 11 deletions include/fused_kernel/core/data/ptr_nd.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

namespace fk {
enum class MemType { Device, Host, HostPinned, DeviceAndPinned };
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
constexpr MemType defaultMemType = MemType::DeviceAndPinned;
#else
constexpr MemType defaultMemType = MemType::Host;
Expand Down Expand Up @@ -152,7 +152,7 @@ namespace fk {
}

inline constexpr void allocDevice() {
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
int currentDevice;
gpuErrchk(cudaGetDevice(&currentDevice));
gpuErrchk(cudaSetDevice(deviceID));
Expand All @@ -171,7 +171,7 @@ namespace fk {
}

inline constexpr void allocHostPinned() {
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
int currentDevice;
gpuErrchk(cudaGetDevice(&currentDevice));
gpuErrchk(cudaSetDevice(deviceID));
Expand All @@ -186,7 +186,7 @@ namespace fk {
}

inline constexpr void allocDeviceAndPinned() {
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
int currentDevice;
gpuErrchk(cudaGetDevice(&currentDevice));
gpuErrchk(cudaSetDevice(deviceID));
Expand All @@ -209,7 +209,7 @@ namespace fk {
switch (type) {
case MemType::Device:
{
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
gpuErrchk(cudaFree(ref->ptr));
#else
throw std::runtime_error("Device memory deallocation not supported in non-CUDA compilation.");
Expand All @@ -223,7 +223,7 @@ namespace fk {
}
case MemType::HostPinned:
{
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
gpuErrchk(cudaFreeHost(ref->ptr));
#else
throw std::runtime_error("Host pinned memory deallocation not supported in non-CUDA compilation.");
Expand All @@ -232,7 +232,7 @@ namespace fk {
}
case MemType::DeviceAndPinned:
{
#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
gpuErrchk(cudaFree(ref->ptr));
gpuErrchk(cudaFreeHost(ref->pinnedPtr));
#else
Expand All @@ -249,7 +249,7 @@ namespace fk {
}
}

#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
inline void copy(const RawPtr<D, T>& thisPtr, RawPtr<D, T>& other, const cudaMemcpyKind& kind,
cudaStream_t stream = 0) const {
if ((other.dims.pitch == other.dims.width * sizeof(T)) && (thisPtr.dims.pitch == thisPtr.dims.width * sizeof(T))) {
Expand Down Expand Up @@ -480,7 +480,7 @@ namespace fk {
return *this;
}

#if defined(__NVCC__)
#if defined(__NVCC__) || HIP_HOST_DEVICE
inline void uploadTo(Ptr<D, T>& other, cudaStream_t stream = 0) {
constexpr cudaMemcpyKind kind = cudaMemcpyHostToDevice;
constexpr MemType otherExpectedMemType1 = MemType::Device;
Expand Down Expand Up @@ -516,7 +516,8 @@ namespace fk {
throw std::runtime_error("Download can only copy from Device pointers.");
}
}

#endif
#if defined(__NVCC__) || CLANG_HOST_DEVICE
inline void upload(Stream_<ParArch::GPU_NVIDIA>& stream) {
if (type == MemType::DeviceAndPinned) {
constexpr cudaMemcpyKind kind = cudaMemcpyHostToDevice;
Expand All @@ -529,10 +530,23 @@ namespace fk {
copy(ptr_a, ptr_pinned, kind, stream);
}
}
#elif HIP_HOST_DEVICE
inline void upload(Stream_<ParArch::GPU_AMD>& stream) {
if (type == MemType::DeviceAndPinned) {
constexpr cudaMemcpyKind kind = cudaMemcpyHostToDevice;
copy(ptr_pinned, ptr_a, kind, stream.getHIPStream());
}
}
inline void download(Stream_<ParArch::GPU_AMD>& stream) {
if (type == MemType::DeviceAndPinned) {
constexpr cudaMemcpyKind kind = cudaMemcpyDeviceToHost;
copy(ptr_a, ptr_pinned, kind, stream.getHIPStream());
}
}
#else
inline void upload(Stream& stream) {}
inline void download(Stream& stream) {}
#endif // defined(__NVCC__) || defined(__HIP__) || defined(NVRTC_ENABLED)
#endif // defined(__NVCC__) || CLANG_HOST_DEVICE || HIP_HOST_DEVICE

inline T at(const Point p) const {
if (type != MemType::Device) {
Expand Down
16 changes: 16 additions & 0 deletions include/fused_kernel/core/data/ptr_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,22 @@ namespace fk {
else {
Executor<TransformDPP<PA>>::executeOperations(stream, ReadSet<T>::build(value, outputPtr.dims()), PerThreadWrite<D, T>::build(output));
}
#elif HIP_HOST_DEVICE
if constexpr (PA == ParArch::GPU_AMD) {
if (outputPtr.getMemType() == MemType::Device || outputPtr.getMemType() == MemType::DeviceAndPinned) {
Executor<TransformDPP<ParArch::GPU_AMD>>::executeOperations(stream, ReadSet<T>::build(value, outputPtr.dims()), PerThreadWrite<D, T>::build(output));
if (outputPtr.getMemType() == MemType::DeviceAndPinned) {
Stream_<ParArch::CPU> cpuStream;
Executor<TransformDPP<ParArch::CPU>>::executeOperations(cpuStream, ReadSet<T>::build(value, outputPtr.dims()), PerThreadWrite<D, T>::build(outputPtr.ptrPinned()));
}
}
else {
Executor<TransformDPP<ParArch::GPU_AMD>>::executeOperations(stream, ReadSet<T>::build(value, outputPtr.dims()), PerThreadWrite<D, T>::build(output));
}
}
else {
Executor<TransformDPP<PA>>::executeOperations(stream, ReadSet<T>::build(value, outputPtr.dims()), PerThreadWrite<D, T>::build(output));
}
#else
Executor<TransformDPP<PA>>::executeOperations(stream, ReadSet<T>::build(value, outputPtr.dims()), PerThreadWrite<D, T>::build(outputPtr));
#endif
Expand Down
3 changes: 3 additions & 0 deletions include/fused_kernel/core/data/vector_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,9 @@ namespace fk {

#if defined(__NVCC__)
#include <vector_types.h>
#elif HIP_HOST_DEVICE
// hip_runtime.h (included via utils.h) provides HIP vector types (char1, uchar1, etc.)
// via <hip/hip_vector_types.h>, so no additional includes needed here.
#else
using char1 = fk::Char1;
using uchar1 = fk::Uchar1;
Expand Down
Loading
Loading