diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..d1793a74 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +*.h linguist-detectable=false +*.h linguist-language=cpp +*.h linguist-language=cuda \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 92440e77..18545c46 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,17 @@ else() message(STATUS "CUDA compiler not found, CUDA support will be disabled.") endif() +# HIP support is only tested on Linux. +check_language(HIP) +if (CMAKE_HIP_COMPILER) + option (ENABLE_HIP "Enable HIP/ROCm AMD GPU support" ON) + if (${ENABLE_HIP}) + include(cmake/hip_init.cmake) + endif() +else() + message(STATUS "HIP compiler not found, HIP support will be disabled.") +endif() + add_subdirectory(include) add_subdirectory(lib) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 69ccf5a0..bfb60413 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -15,6 +15,10 @@ function(add_cuda_to_benchmark TARGET_NAME) endif() endfunction() +function(add_hip_to_benchmark TARGET_NAME) + add_hip_to_target(${TARGET_NAME}) +endfunction() + function (add_generated_benchmark TARGET_NAME TEST_SOURCE EXTENSION DIR) set(TEST_GENERATED_SOURCE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_${EXTENSION}/launcher.${EXTENSION}") #use the same name as the target ) @@ -67,6 +71,10 @@ function (discover_benchmark DIR) add_generated_benchmark("${TARGET_NAME}" "${benchmark_source}" "cu" "${DIR_NAME}") add_cuda_to_benchmark("${TARGET_NAME}_cu") endif() + if (CMAKE_HIP_COMPILER AND ENABLE_HIP) + add_generated_benchmark("${TARGET_NAME}" "${benchmark_source}" "hip" "${DIR_NAME}") + add_hip_to_benchmark("${TARGET_NAME}_hip") + endif() endforeach() endfunction() diff --git a/cmake/discover_tests.cmake b/cmake/discover_tests.cmake index 03f6bd9f..ecdcbce7 100644 --- a/cmake/discover_tests.cmake +++ b/cmake/discover_tests.cmake @@ -15,6 +15,10 @@ function(add_cuda_to_test 
TARGET_NAME) endif() endfunction() +function(add_hip_to_test TARGET_NAME) + add_hip_to_target(${TARGET_NAME}) +endfunction() + function (add_generated_test TARGET_NAME TEST_SOURCE EXTENSION DIR) set(TEST_GENERATED_SOURCE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_${EXTENSION}/launcher.${EXTENSION}") #use the same name as the target ) @@ -33,7 +37,9 @@ function (add_generated_test TARGET_NAME TEST_SOURCE EXTENSION DIR) target_include_directories(${TARGET_NAME_EXT} PRIVATE "${CMAKE_SOURCE_DIR}") target_include_directories(${TARGET_NAME_EXT} PRIVATE "${DIR}") target_link_libraries(${TARGET_NAME_EXT} PRIVATE FKL::FKL) - target_link_libraries(${TARGET_NAME_EXT} PRIVATE CUDA::cuda_driver) + if ("${EXTENSION}" STREQUAL "cu") + target_link_libraries(${TARGET_NAME_EXT} PRIVATE CUDA::cuda_driver) + endif() if (NVRTC_ENABLE) target_link_libraries(${TARGET_NAME_EXT} PRIVATE ${NVRTC_LIBRARIES}) target_compile_definitions(${TARGET_NAME_EXT} PRIVATE NVRTC_ENABLE) @@ -78,6 +84,10 @@ function (discover_tests DIR) add_generated_test("${TARGET_NAME}" "${test_source}" "cu" "${DIR_NAME}") add_cuda_to_test("${TARGET_NAME}_cu") endif() + if (CMAKE_HIP_COMPILER AND ENABLE_HIP) + add_generated_test("${TARGET_NAME}" "${test_source}" "hip" "${DIR_NAME}") + add_hip_to_test("${TARGET_NAME}_hip") + endif() endforeach() endfunction() diff --git a/cmake/hip_init.cmake b/cmake/hip_init.cmake new file mode 100644 index 00000000..f57886f6 --- /dev/null +++ b/cmake/hip_init.cmake @@ -0,0 +1,9 @@ +enable_language(HIP) + +# Set HIP compiler and standard +set(CMAKE_HIP_COMPILER "hipcc") +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +include(cmake/libs/hip/hip.cmake) +include(cmake/libs/hip/archs.cmake) diff --git a/cmake/libs/hip/archs.cmake b/cmake/libs/hip/archs.cmake new file mode 100644 index 00000000..e80fbc19 --- /dev/null +++ b/cmake/libs/hip/archs.cmake @@ -0,0 +1,11 @@ +# HIP GPU architecture selection +# Defaults to "native" which lets the compiler auto-detect the target 
GPU +set(HIP_ARCH "native" CACHE STRING "HIP/ROCm GPU architecture to build for (e.g. native, gfx1100, gfx90a)") + +function(set_target_hip_arch_flags TARGET_NAME) + if ("${HIP_ARCH}" STREQUAL "native") + set_target_properties(${TARGET_NAME} PROPERTIES HIP_ARCHITECTURES "native") + else() + set_target_properties(${TARGET_NAME} PROPERTIES HIP_ARCHITECTURES "${HIP_ARCH}") + endif() +endfunction() diff --git a/cmake/libs/hip/hip.cmake b/cmake/libs/hip/hip.cmake new file mode 100644 index 00000000..91c3219c --- /dev/null +++ b/cmake/libs/hip/hip.cmake @@ -0,0 +1,22 @@ +option(ENABLE_HIP_LINE_INFO "Enable line info for HIP kernels compilation" ON) +option(ENABLE_HIP_DEBUG "Generate HIP debug information for device code" OFF) + +include(cmake/libs/hip/target_generation.cmake) +set(ROCM_ROOT "/opt/rocm-7.2.0" CACHE PATH "Root directory of the ROCm installation") +list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}") + +find_package(hip CONFIG REQUIRED) + +function(add_hip_to_target TARGET_NAME) + set_default_hip_target_properties(${TARGET_NAME}) + set_target_hip_arch_flags(${TARGET_NAME}) + + if (${ENABLE_HIP_DEBUG}) + add_hip_debug_support_to_target(${TARGET_NAME}) + endif() + if (${ENABLE_HIP_LINE_INFO}) + add_hip_lineinfo_to_target(${TARGET_NAME}) + endif() + #hip-lang::device hip-lang::amdhip64 + target_link_libraries(${TARGET_NAME} PRIVATE hip::host) +endfunction() diff --git a/cmake/libs/hip/target_generation.cmake b/cmake/libs/hip/target_generation.cmake new file mode 100644 index 00000000..a3c96208 --- /dev/null +++ b/cmake/libs/hip/target_generation.cmake @@ -0,0 +1,27 @@ +function(set_default_hip_target_properties TARGET_NAME) + if (WIN32) + target_compile_options(${TARGET_NAME} PRIVATE $<$:-Xcompiler=/bigobj /Zc:preprocessor>) + endif() + set_target_properties(${TARGET_NAME} PROPERTIES + HIP_STANDARD 17 + HIP_STANDARD_REQUIRED ON + HIP_EXTENSIONS OFF) + if (NOT(${TEMPLATE_DEPTH} STREQUAL "default")) + target_compile_options(${TARGET_NAME} PRIVATE 
$<$:-ftemplate-depth=${TEMPLATE_DEPTH}>) + if (NOT WIN32) + target_compile_options(${TARGET_NAME} PRIVATE $<$:-ftemplate-depth=${TEMPLATE_DEPTH}>) + endif() + endif() +endfunction() + +function(add_hip_debug_support_to_target TARGET_NAME) + target_compile_options(${TARGET_NAME} PRIVATE $<$,$>:-ggdb>) +endfunction() + +function(add_hip_lineinfo_to_target TARGET_NAME) + if (NOT ${ENABLE_HIP_DEBUG}) + if (CMAKE_HIP_HOST_COMPILER_ID STREQUAL "Clang") + target_compile_options(${TARGET_NAME} PRIVATE $<$:-gline-tables-only>) + endif() + endif() +endfunction() diff --git a/cmake/tests/add_generated_test.cmake b/cmake/tests/add_generated_test.cmake index a0ec9bc4..8602db39 100644 --- a/cmake/tests/add_generated_test.cmake +++ b/cmake/tests/add_generated_test.cmake @@ -15,6 +15,10 @@ function(add_cuda_to_test TARGET_NAME) endif() endfunction() +function(add_hip_to_test TARGET_NAME) + add_hip_to_target(${TARGET_NAME}) +endfunction() + function(configure_test_target_flags TARGET_NAME TEST_SOURCE DIR) set(TEST_GENERATED_SOURCE "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_${EXTENSION}/launcher.${EXTENSION}") #use the same name as the target ) @@ -48,6 +52,8 @@ endfunction() function (set_ide_target_folder TARGET_NAME DIR_PARENT_PATH EXTENSION) if (${EXTENSION} STREQUAL "cu") set(FKL_BACKEND "cuda") + elseif(${EXTENSION} STREQUAL "hip") + set(FKL_BACKEND "hip") elseif(${EXTENSION} STREQUAL "cpp") set(FKL_BACKEND "cpu") else() diff --git a/cmake/tests/add_shared_test_libs.cmake b/cmake/tests/add_shared_test_libs.cmake index 6ed34494..1220d0ff 100644 --- a/cmake/tests/add_shared_test_libs.cmake +++ b/cmake/tests/add_shared_test_libs.cmake @@ -33,7 +33,9 @@ function (add_shared_test_lib TARGET_BASE_NAME DIR EXTENSION FUNDAMENTAL_TYPE) set(TARGET_NAME "${TARGET_BASE_NAME}_${FUNDAMENTAL_TYPE}") add_shared_target("${TARGET_BASE_NAME}" "${EXTENSION}" "${FUNDAMENTAL_TYPE}" "${DIR}") if ("${EXTENSION}" STREQUAL "cu") - add_cuda_to_test("${TARGET_NAME}_${EXTENSION}") + 
add_cuda_to_test("${TARGET_NAME}_${EXTENSION}") + elseif ("${EXTENSION}" STREQUAL "hip") + add_hip_to_test("${TARGET_NAME}_${EXTENSION}") endif() endfunction() \ No newline at end of file diff --git a/cmake/tests/discover_tests.cmake b/cmake/tests/discover_tests.cmake index 57416626..59e63b53 100644 --- a/cmake/tests/discover_tests.cmake +++ b/cmake/tests/discover_tests.cmake @@ -31,6 +31,13 @@ function (discover_tests DIR) add_generated_test("${TARGET_NAME}" "${TEST_SOURCE}" "cu" "${DIR_RELATIVE_PATH}") add_cuda_to_test("${TARGET_NAME}_cu") endif() + endif() + + if (CMAKE_HIP_COMPILER AND ENABLE_HIP) + if (${POS_ONLY_CPU} EQUAL -1) #if the source file does not contain "__ONLY_CPU__" + add_generated_test("${TARGET_NAME}" "${TEST_SOURCE}" "hip" "${DIR_RELATIVE_PATH}") + add_hip_to_test("${TARGET_NAME}_hip") + endif() endif() endforeach() endfunction() diff --git a/include/fused_kernel/algorithms/image_processing/image.h b/include/fused_kernel/algorithms/image_processing/image.h index e9e352d2..a895e57b 100644 --- a/include/fused_kernel/algorithms/image_processing/image.h +++ b/include/fused_kernel/algorithms/image_processing/image.h @@ -66,7 +66,7 @@ namespace fk { return Image(data.crop(dataPoint, newDataDims), newWidth, newHeight); } #if !defined(NVRTC_COMPILER) -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE inline void uploadTo(Image& other, cudaStream_t stream = 0) { data.uploadTo(other.data, stream); } @@ -74,17 +74,25 @@ namespace fk { inline void downloadTo(Image& other, cudaStream_t stream = 0) { data.downloadTo(other.data, stream); } - +#endif +#if defined(__NVCC__) || CLANG_HOST_DEVICE inline void upload(Stream_& stream) { data.upload(stream); } inline void download(Stream_& stream) { data.download(stream); } +#elif HIP_HOST_DEVICE + inline void upload(Stream_& stream) { + data.upload(stream); + } + inline void download(Stream_& stream) { + data.download(stream); + } #else inline void upload(Stream& stream) {} inline void download(Stream& 
stream) {} -#endif // defined(__NVCC__) || defined(__HIP__) || defined(NVRTC_ENABLED) +#endif // defined(__NVCC__) || CLANG_HOST_DEVICE || HIP_HOST_DEVICE #endif // defined(NVRTC_COMPILER) FK_HOST_CNST VectorType_t::cn> readAt(const Point p) const { diff --git a/include/fused_kernel/core/data/ptr_nd.h b/include/fused_kernel/core/data/ptr_nd.h index 0e70f253..ff90c044 100644 --- a/include/fused_kernel/core/data/ptr_nd.h +++ b/include/fused_kernel/core/data/ptr_nd.h @@ -27,7 +27,7 @@ namespace fk { enum class MemType { Device, Host, HostPinned, DeviceAndPinned }; -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE constexpr MemType defaultMemType = MemType::DeviceAndPinned; #else constexpr MemType defaultMemType = MemType::Host; @@ -152,7 +152,7 @@ namespace fk { } inline constexpr void allocDevice() { - #if defined(__NVCC__) + #if defined(__NVCC__) || HIP_HOST_DEVICE int currentDevice; gpuErrchk(cudaGetDevice(¤tDevice)); gpuErrchk(cudaSetDevice(deviceID)); @@ -171,7 +171,7 @@ namespace fk { } inline constexpr void allocHostPinned() { - #if defined(__NVCC__) + #if defined(__NVCC__) || HIP_HOST_DEVICE int currentDevice; gpuErrchk(cudaGetDevice(¤tDevice)); gpuErrchk(cudaSetDevice(deviceID)); @@ -186,7 +186,7 @@ namespace fk { } inline constexpr void allocDeviceAndPinned() { - #if defined(__NVCC__) + #if defined(__NVCC__) || HIP_HOST_DEVICE int currentDevice; gpuErrchk(cudaGetDevice(¤tDevice)); gpuErrchk(cudaSetDevice(deviceID)); @@ -209,7 +209,7 @@ namespace fk { switch (type) { case MemType::Device: { - #if defined(__NVCC__) + #if defined(__NVCC__) || HIP_HOST_DEVICE gpuErrchk(cudaFree(ref->ptr)); #else throw std::runtime_error("Device memory deallocation not supported in non-CUDA compilation."); @@ -223,7 +223,7 @@ namespace fk { } case MemType::HostPinned: { - #if defined(__NVCC__) + #if defined(__NVCC__) || HIP_HOST_DEVICE gpuErrchk(cudaFreeHost(ref->ptr)); #else throw std::runtime_error("Host pinned memory deallocation not supported in non-CUDA 
compilation."); @@ -232,7 +232,7 @@ namespace fk { } case MemType::DeviceAndPinned: { -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE gpuErrchk(cudaFree(ref->ptr)); gpuErrchk(cudaFreeHost(ref->pinnedPtr)); #else @@ -249,7 +249,7 @@ namespace fk { } } -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE inline void copy(const RawPtr& thisPtr, RawPtr& other, const cudaMemcpyKind& kind, cudaStream_t stream = 0) const { if ((other.dims.pitch == other.dims.width * sizeof(T)) && (thisPtr.dims.pitch == thisPtr.dims.width * sizeof(T))) { @@ -480,7 +480,7 @@ namespace fk { return *this; } -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE inline void uploadTo(Ptr& other, cudaStream_t stream = 0) { constexpr cudaMemcpyKind kind = cudaMemcpyHostToDevice; constexpr MemType otherExpectedMemType1 = MemType::Device; @@ -516,7 +516,8 @@ namespace fk { throw std::runtime_error("Download can only copy from Device pointers."); } } - +#endif +#if defined(__NVCC__) || CLANG_HOST_DEVICE inline void upload(Stream_& stream) { if (type == MemType::DeviceAndPinned) { constexpr cudaMemcpyKind kind = cudaMemcpyHostToDevice; @@ -529,10 +530,23 @@ namespace fk { copy(ptr_a, ptr_pinned, kind, stream); } } +#elif HIP_HOST_DEVICE + inline void upload(Stream_& stream) { + if (type == MemType::DeviceAndPinned) { + constexpr cudaMemcpyKind kind = cudaMemcpyHostToDevice; + copy(ptr_pinned, ptr_a, kind, stream.getHIPStream()); + } + } + inline void download(Stream_& stream) { + if (type == MemType::DeviceAndPinned) { + constexpr cudaMemcpyKind kind = cudaMemcpyDeviceToHost; + copy(ptr_a, ptr_pinned, kind, stream.getHIPStream()); + } + } #else inline void upload(Stream& stream) {} inline void download(Stream& stream) {} -#endif // defined(__NVCC__) || defined(__HIP__) || defined(NVRTC_ENABLED) +#endif // defined(__NVCC__) || CLANG_HOST_DEVICE || HIP_HOST_DEVICE inline T at(const Point p) const { if (type != MemType::Device) { diff --git 
a/include/fused_kernel/core/data/ptr_utils.h b/include/fused_kernel/core/data/ptr_utils.h index 56dc1f55..483bdb24 100644 --- a/include/fused_kernel/core/data/ptr_utils.h +++ b/include/fused_kernel/core/data/ptr_utils.h @@ -39,6 +39,22 @@ namespace fk { else { Executor>::executeOperations(stream, ReadSet::build(value, outputPtr.dims()), PerThreadWrite::build(output)); } +#elif HIP_HOST_DEVICE + if constexpr (PA == ParArch::GPU_AMD) { + if (outputPtr.getMemType() == MemType::Device || outputPtr.getMemType() == MemType::DeviceAndPinned) { + Executor>::executeOperations(stream, ReadSet::build(value, outputPtr.dims()), PerThreadWrite::build(output)); + if (outputPtr.getMemType() == MemType::DeviceAndPinned) { + Stream_ cpuStream; + Executor>::executeOperations(cpuStream, ReadSet::build(value, outputPtr.dims()), PerThreadWrite::build(outputPtr.ptrPinned())); + } + } + else { + Executor>::executeOperations(stream, ReadSet::build(value, outputPtr.dims()), PerThreadWrite::build(output)); + } + } + else { + Executor>::executeOperations(stream, ReadSet::build(value, outputPtr.dims()), PerThreadWrite::build(output)); + } #else Executor>::executeOperations(stream, ReadSet::build(value, outputPtr.dims()), PerThreadWrite::build(outputPtr)); #endif diff --git a/include/fused_kernel/core/data/vector_types.h b/include/fused_kernel/core/data/vector_types.h index 3ebd2258..d39c11d9 100644 --- a/include/fused_kernel/core/data/vector_types.h +++ b/include/fused_kernel/core/data/vector_types.h @@ -261,6 +261,9 @@ namespace fk { #if defined(__NVCC__) #include +#elif HIP_HOST_DEVICE +// hip_runtime.h (included via utils.h) provides HIP vector types (char1, uchar1, etc.) +// via , so no additional includes needed here. 
#else using char1 = fk::Char1; using uchar1 = fk::Uchar1; diff --git a/include/fused_kernel/core/execution_model/data_parallel_patterns.h b/include/fused_kernel/core/execution_model/data_parallel_patterns.h index 24da3e92..98a374d6 100644 --- a/include/fused_kernel/core/execution_model/data_parallel_patterns.h +++ b/include/fused_kernel/core/execution_model/data_parallel_patterns.h @@ -68,6 +68,7 @@ namespace fk { // namespace FusedKernel template struct TransformDPPBase { friend struct TransformDPP; // Allow TransformDPP to access private members + friend struct TransformDPP; // Allow TransformDPP to access private members friend struct TransformDPP; // Allow TransformDPPBase to access private members private: using Details = DPPDetails; @@ -235,6 +236,36 @@ namespace fk { // namespace FusedKernel }; #endif // defined(__NVCC__) +#if HIP_HOST_DEVICE + template + struct TransformDPP, void>> { + private: + using Parent = TransformDPPBase; + using Details = DPPDetails; + public: + static constexpr ParArch PAR_ARCH = ParArch::GPU_AMD; + template + FK_HOST_DEVICE_FUSE ActiveThreads getActiveThreads(const Details& details, + const FirstIOp& iOp) { + return Parent::getActiveThreads(details, iOp); + } + + template + FK_DEVICE_FUSE void exec(const Details& details, const IOps&... 
iOps) { + const int x = (blockDim.x * blockIdx.x) + threadIdx.x; + const int y = (blockDim.y * blockIdx.y) + threadIdx.y; + const int z = blockIdx.z; + const Point thread{ x, y, z }; + + const ActiveThreads activeThreads = getActiveThreads(details, get_arg<0>(iOps...)); + + if (x < activeThreads.x && y < activeThreads.y) { + Parent::execute_thread(thread, activeThreads, iOps...); + } + } + }; +#endif // HIP_HOST_DEVICE + template struct TransformDPP, void>> { private: @@ -267,15 +298,16 @@ namespace fk { // namespace FusedKernel template struct DivergentBatchTransformDPP; - template + template struct DivergentBatchTransformDPPBase { friend struct DivergentBatchTransformDPP; // Allow DivergentBatchTransformDPP to access private members + friend struct DivergentBatchTransformDPP; // Allow DivergentBatchTransformDPP to access private members friend struct DivergentBatchTransformDPP; // Allow DivergentBatchTransformDPPBase to access private members private: template FK_HOST_DEVICE_FUSE void launchTransformDPP(const IOps&... 
iOps) { using Details = TransformDPPDetails; - TransformDPP::exec(Details{}, iOps...); + TransformDPP::exec(Details{}, iOps...); } template @@ -296,6 +328,9 @@ namespace fk { // namespace FusedKernel template <> struct DivergentBatchTransformDPPDetails {}; + template <> + struct DivergentBatchTransformDPPDetails {}; + template <> struct DivergentBatchTransformDPPDetails { uint numPlanes; @@ -305,7 +340,7 @@ namespace fk { // namespace FusedKernel template struct DivergentBatchTransformDPP { private: - using Parent = DivergentBatchTransformDPPBase; + using Parent = DivergentBatchTransformDPPBase; public: using DPPDetails = DivergentBatchTransformDPPDetails; static constexpr ParArch PAR_ARCH = ParArch::GPU_NVIDIA; @@ -319,10 +354,25 @@ namespace fk { // namespace FusedKernel } }; #endif // defined(__NVCC__) +#if HIP_HOST_DEVICE + template + struct DivergentBatchTransformDPP { + private: + using Parent = DivergentBatchTransformDPPBase; + public: + using DPPDetails = DivergentBatchTransformDPPDetails; + static constexpr ParArch PAR_ARCH = ParArch::GPU_AMD; + template + FK_DEVICE_FUSE void exec(const DPPDetails&, const IOpSequenceTypes&... 
iOpSequences) { + const uint z = blockIdx.z; + Parent::template divergent_operate<1>(z, iOpSequences...); + } + }; +#endif // HIP_HOST_DEVICE template struct DivergentBatchTransformDPP { private: - using Parent = DivergentBatchTransformDPPBase; + using Parent = DivergentBatchTransformDPPBase; public: using DPPDetails = DivergentBatchTransformDPPDetails; static constexpr ParArch PAR_ARCH = ParArch::CPU; diff --git a/include/fused_kernel/core/execution_model/executor_details/executor_kernels.h b/include/fused_kernel/core/execution_model/executor_details/executor_kernels.h index 41348be9..a8ab8078 100644 --- a/include/fused_kernel/core/execution_model/executor_details/executor_kernels.h +++ b/include/fused_kernel/core/execution_model/executor_details/executor_kernels.h @@ -15,7 +15,7 @@ #ifndef FK_EXECUTOR_KERNELS_H #define FK_EXECUTOR_KERNELS_H -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE namespace fk { template __global__ void launchDivergentBatchTransformDPP_Kernel(const __grid_constant__ DPPDetails details, diff --git a/include/fused_kernel/core/execution_model/executors.h b/include/fused_kernel/core/execution_model/executors.h index d33375e5..8465e720 100644 --- a/include/fused_kernel/core/execution_model/executors.h +++ b/include/fused_kernel/core/execution_model/executors.h @@ -23,13 +23,13 @@ #include #include -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE #include #endif namespace fk { -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE struct CtxDim3 { uint x; uint y; @@ -155,8 +155,9 @@ FK_HOST_FUSE void executeOperations(const std::array, Batch>& input, co struct Executor { FK_STATIC_STRUCT(Executor, Executor) static_assert(DataParallelPattern::PAR_ARCH == ParArch::GPU_NVIDIA || + DataParallelPattern::PAR_ARCH == ParArch::GPU_AMD || DataParallelPattern::PAR_ARCH == ParArch::CPU, - "Only GPU_NVIDIA and CPU supported"); + "Only GPU_NVIDIA, GPU_AMD and CPU are supported"); }; #endif @@ -188,7 +189,7 @@ 
FK_HOST_FUSE void executeOperations(const std::array, Batch>& input, co DECLARE_EXECUTOR_PARENT_IMPL }; -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE struct ComputeBestSolutionBase { FK_HOST_FUSE uint computeDiscardedThreads(const uint width, const uint height, const uint blockDimx, const uint blockDimy) { const uint modX = width % blockDimx; @@ -359,7 +360,109 @@ FK_HOST_FUSE void executeOperations(const std::array, Batch>& input, co executeOperations_helper(stream, iOpSequences...); } }; -#endif +#endif // defined(__NVCC__) || CLANG_HOST_DEVICE +#if HIP_HOST_DEVICE + template + struct Executor> { + private: + using Child = Executor>; + using Parent = BaseExecutor; + template + FK_HOST_FUSE void executeOperations_helper(Stream_& stream_, const IOps&... iOps) { + const hipStream_t stream = stream_.getHIPStream(); + constexpr ParArch PA = ParArch::GPU_AMD; + const auto tDetails = TransformDPP::build_details(iOps...); + if constexpr (decltype(tDetails)::TFI::ENABLED) { + const ActiveThreads activeThreads = tDetails.activeThreads; + + const CtxDim3 ctx_block = getDefaultBlockSize(activeThreads.x, activeThreads.y); + + const dim3 block{ ctx_block.x, ctx_block.y, 1 }; + const dim3 grid{ static_cast(ceil(activeThreads.x / static_cast(block.x))), + static_cast(ceil(activeThreads.y / static_cast(block.y))), + activeThreads.z }; + if (!tDetails.threadDivisible) { + launchTransformDPP_Kernel<<>>(tDetails, iOps...); + gpuErrchk(hipGetLastError()); + } else { + launchTransformDPP_Kernel<<>>(tDetails, iOps...); + gpuErrchk(hipGetLastError()); + } + } else { + const auto readOp = get_arg<0>(iOps...); + + const ActiveThreads activeThreads = readOp.getActiveThreads(); + + const CtxDim3 ctx_block = getDefaultBlockSize(activeThreads.x, activeThreads.y); + + const dim3 block{ ctx_block.x, ctx_block.y, 1 }; + const dim3 grid{ static_cast(ceil(activeThreads.x / static_cast(block.x))), + static_cast(ceil(activeThreads.y / static_cast(block.y))), + activeThreads.z }; + 
launchTransformDPP_Kernel<<>>(tDetails, iOps...); + gpuErrchk(hipGetLastError()); + } + } + public: + FK_STATIC_STRUCT(Executor, Child) + FK_HOST_FUSE ParArch parArch() { + return ParArch::GPU_AMD; + } + DECLARE_EXECUTOR_PARENT_IMPL + }; + + template + struct Executor> { + private: + using DPPType = DivergentBatchTransformDPP; + using DPPDetails = typename DPPType::DPPDetails; + using SelfType = Executor; + + template + FK_HOST_FUSE ActiveThreads getActiveThreads(const IOpSequenceTypes&... iOpSequences) { + const uint x = cxp::max::f(get<0>(iOpSequences.iOps).getActiveThreads().x...); + const uint y = cxp::max::f(get<0>(iOpSequences.iOps).getActiveThreads().y...); + const uint z = cxp::sum::f(get<0>(iOpSequences.iOps).getActiveThreads().z...); + return ActiveThreads{ x, y, z }; + } + + template + FK_HOST_FUSE auto fuseBackSequence(const IOpSequence& iOpSeq) { + return buildOperationSequence_tup( + apply([](auto&&... args) { + return BackFuser::fuse_back(std::forward(args)...); + }, iOpSeq.iOps) + ); + } + + template + FK_HOST_FUSE void executeOperationsFused(Stream_& stream, const IOpSequenceTypes&... iOpSequences) { + const ActiveThreads activeThreads = getActiveThreads(iOpSequences...); + const DPPDetails details{}; + + const dim3 block(cxp::min::f(activeThreads.x, 32u), cxp::min::f(activeThreads.y, 8u)); + const dim3 grid(ceil(activeThreads.x / static_cast(block.x)), + ceil(activeThreads.y / static_cast(block.y)), activeThreads.z); + launchDivergentBatchTransformDPP_Kernel<<>>(details, iOpSequences...); + gpuErrchk(hipGetLastError()); + } + + template + FK_HOST_FUSE void executeOperations_helper(Stream_& stream, const IOpSequenceTypes&... iOpSequences) { + executeOperationsFused(stream, fuseBackSequence(iOpSequences)...); + } + + public: + FK_STATIC_STRUCT(Executor, SelfType) + FK_HOST_FUSE ParArch parArch() { + return ParArch::GPU_AMD; + } + template + FK_HOST_FUSE void executeOperations(Stream_& stream, const IOpSequenceTypes&... 
iOpSequences) { + executeOperations_helper(stream, iOpSequences...); + } + }; +#endif // HIP_HOST_DEVICE } // namespace fk #endif // FK_EXECUTORS_CUH \ No newline at end of file diff --git a/include/fused_kernel/core/execution_model/parallel_architectures.h b/include/fused_kernel/core/execution_model/parallel_architectures.h index 487f6bf9..679d3691 100644 --- a/include/fused_kernel/core/execution_model/parallel_architectures.h +++ b/include/fused_kernel/core/execution_model/parallel_architectures.h @@ -48,7 +48,12 @@ namespace fk { #undef PARALLEL_ARCHITECTURES #if defined(__NVCC__) +#ifndef HIP_HOST_DEVICE +static_assert(false); +#endif constexpr ParArch defaultParArch = ParArch::GPU_NVIDIA; +#elif HIP_HOST_DEVICE == 1 + constexpr ParArch defaultParArch = ParArch::GPU_AMD; #else constexpr ParArch defaultParArch = ParArch::CPU; #endif diff --git a/include/fused_kernel/core/execution_model/stream.h b/include/fused_kernel/core/execution_model/stream.h index 9803eb03..1dc0e756 100644 --- a/include/fused_kernel/core/execution_model/stream.h +++ b/include/fused_kernel/core/execution_model/stream.h @@ -20,8 +20,8 @@ #if defined(__NVCC__) #include -#elif defined(__HIP__) -#include +#elif HIP_HOST_DEVICE +#include #endif namespace fk { @@ -109,6 +109,67 @@ namespace fk { }; #endif +#if HIP_HOST_DEVICE + template <> + class Stream_ final : public BaseStream { + hipStream_t m_stream; + bool m_isMine{ false }; + + inline void initFromOther(const Stream_& other) { + m_stream = other.m_stream; + m_isMine = other.m_isMine; + } + + public: + Stream_() : BaseStream() { + gpuErrchk(hipStreamCreate(&m_stream)); + m_isMine = true; + } + Stream_(const Stream_& other) : BaseStream(other) { + initFromOther(other); + } + explicit Stream_(const hipStream_t& stream) : m_stream(stream), BaseStream() {} + + hipStream_t operator()() const { + return m_stream; + } + + Stream_& operator=(const Stream_& other) { + if (this != &other) { + BaseStream::operator=(other); + initFromOther(other); + } 
+ return *this; + } + + Stream_(Stream_&&) = delete; + Stream_& operator=(Stream_&&) = delete; + + ~Stream_() { + if ( this->getRefCount() == 0 && m_stream != 0 && m_isMine) { + sync(); + gpuErrchk(hipStreamDestroy(m_stream)); + } + } + + operator hipStream_t() const { + return m_stream; + } + inline hipStream_t getHIPStream() const { + return m_stream; + } + inline void sync() final { + gpuErrchk(hipStreamSynchronize(m_stream)); + } + constexpr inline enum ParArch getParArch() const { + return ParArch::GPU_AMD; + }; + static constexpr inline enum ParArch parArch() { + return ParArch::GPU_AMD; + } + }; +#endif + template <> class Stream_ final : public BaseStream { public: diff --git a/include/fused_kernel/core/utils/compiler_macros.h b/include/fused_kernel/core/utils/compiler_macros.h index 41d19186..9795c8d2 100644 --- a/include/fused_kernel/core/utils/compiler_macros.h +++ b/include/fused_kernel/core/utils/compiler_macros.h @@ -22,4 +22,11 @@ #define _MSC_VER_EXISTS 0 #endif +// HIP platform detection: __HIP__ is defined by clang when compiling HIP code +#if defined(__HIP__) +#define HIP_HOST_DEVICE 1 +#else +#define HIP_HOST_DEVICE 0 +#endif + #endif // COMPILER_MACROS_H \ No newline at end of file diff --git a/include/fused_kernel/core/utils/utils.h b/include/fused_kernel/core/utils/utils.h index 307aa2b3..92c0545e 100644 --- a/include/fused_kernel/core/utils/utils.h +++ b/include/fused_kernel/core/utils/utils.h @@ -23,6 +23,8 @@ #if defined(__NVCC__) #include +#elif HIP_HOST_DEVICE +#include #endif #if defined(NVRTC_ENABLED) @@ -31,7 +33,7 @@ #endif #endif // NVRTC_COMPILER -#if defined(__NVCC__) +#if defined(__NVCC__) || HIP_HOST_DEVICE #define FK_DEVICE_FUSE __device__ __forceinline__ static constexpr #define FK_DEVICE_CNST __device__ __forceinline__ constexpr #define FK_HOST_DEVICE_FUSE __host__ FK_DEVICE_FUSE @@ -147,7 +149,27 @@ namespace fk { } // namespace fk #define gpuErrchk(ans) { fk::gpuAssert((ans), __FILE__, __LINE__, true); } -#endif // 
defined(__NVCC__) +#elif HIP_HOST_DEVICE +namespace fk { + inline void gpuAssert(hipError_t code, + const char *file, + int line, + bool abort = true) { + if (code != hipSuccess) { + std::string message = "GPU Error: "; + message.append(hipGetErrorString(code)); + message.append(" File: "); + message.append(file); + message.append(" Line:"); + message.append(std::to_string(line).c_str()); + message.append("\n"); + if (abort) throw std::runtime_error(message.c_str()); + } + } +} // namespace fk + +#define gpuErrchk(ans) { fk::gpuAssert((ans), __FILE__, __LINE__, true); } +#endif // (__NVCC__) || HIP_HOST_DEVICE // Null type, used for Operation required aliases that can not still be known, // because they are deduced from a backwards operation that is till not defined. diff --git a/tests/main.cpp.hip b/tests/main.cpp.hip new file mode 100644 index 00000000..34e1649c --- /dev/null +++ b/tests/main.cpp.hip @@ -0,0 +1,23 @@ +/* Copyright 2023 Oscar Amoros Huguet + Copyright 2023 Albert Andaluz Gonzalez + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + + + +#include +#include +int main(int argc, char **argv) { + return launch(); +} + diff --git a/utests/algorithm/image_processing/utest_saturate/CMakeLists.txt b/utests/algorithm/image_processing/utest_saturate/CMakeLists.txt index ea9fba20..369b5978 100644 --- a/utests/algorithm/image_processing/utest_saturate/CMakeLists.txt +++ b/utests/algorithm/image_processing/utest_saturate/CMakeLists.txt @@ -44,6 +44,27 @@ if (CMAKE_CUDA_COMPILER AND ENABLE_CUDA) endforeach() endif() - - \ No newline at end of file +if (CMAKE_HIP_COMPILER AND ENABLE_HIP) + set(EXTENSION hip) + string(TOUPPER ${EXTENSION} EXTENSION_UPPER) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_NAME}.h.in + ${CMAKE_BINARY_DIR}/${TARGET_NAME}/${TARGET_NAME}_${EXTENSION}.h) + + add_generated_test_stub("${TARGET_NAME}_hip" + "${CMAKE_BINARY_DIR}/${TARGET_NAME}/${TARGET_NAME}_${EXTENSION}.h" + "") + set_ide_target_folder(${TARGET_NAME}_${EXTENSION} "/utests/algorithm/image_processing" "${EXTENSION}") + + add_hip_to_test("${TARGET_NAME}_hip") + foreach(FUNDAMENTAL_TYPE ${FUNDAMENTAL_TYPES}) + add_shared_test_lib(${TARGET_NAME} + "" + "${EXTENSION}" + "${FUNDAMENTAL_TYPE}" + ) + set_property(TARGET "${TARGET_NAME}_${FUNDAMENTAL_TYPE}_${EXTENSION}" PROPERTY FOLDER "/utests/algorithm/image_processing/hip/${TARGET_NAME}") + target_link_libraries(${TARGET_NAME}_${EXTENSION} PRIVATE "${TARGET_NAME}_${FUNDAMENTAL_TYPE}_${EXTENSION}") + endforeach() + +endif() \ No newline at end of file diff --git a/utests/algorithm/image_processing/utest_saturate/utest_saturate_ftype.hip.in b/utests/algorithm/image_processing/utest_saturate/utest_saturate_ftype.hip.in new file mode 100644 index 00000000..811a5942 --- /dev/null +++ b/utests/algorithm/image_processing/utest_saturate/utest_saturate_ftype.hip.in @@ -0,0 +1,26 @@ +/* Copyright 2025 Oscar Amoros Huguet + Copyright 2025 Grup Mediapro S.L.U + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance 
with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "utest_saturate_@FUNDAMENTAL_TYPE@_@EXTENSION@.h" + +START_ADDING_TESTS +using Fundamental = fk::RemoveType_t<0, fk::StandardTypes>; +addAllOutputTestsForInput(std::make_index_sequence{}); +STOP_ADDING_TESTS + +int fk::utest_saturate_@FUNDAMENTAL_TYPE@_@EXTENSION@::launch() { + RUN_ALL_TESTS + return 0; +} \ No newline at end of file