diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md
index 1b284974c..56f095a3e 100644
--- a/BIBLIOGRAPHY.md
+++ b/BIBLIOGRAPHY.md
@@ -223,8 +223,8 @@ source code and documentation.
   - [dev/x86_64/src/poly_chknorm_avx2.c](dev/x86_64/src/poly_chknorm_avx2.c)
   - [dev/x86_64/src/poly_decompose_32_avx2.c](dev/x86_64/src/poly_decompose_32_avx2.c)
   - [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c)
-  - [dev/x86_64/src/poly_use_hint_32_avx2.c](dev/x86_64/src/poly_use_hint_32_avx2.c)
-  - [dev/x86_64/src/poly_use_hint_88_avx2.c](dev/x86_64/src/poly_use_hint_88_avx2.c)
+  - [dev/x86_64/src/poly_use_hint_32_avx2.S](dev/x86_64/src/poly_use_hint_32_avx2.S)
+  - [dev/x86_64/src/poly_use_hint_88_avx2.S](dev/x86_64/src/poly_use_hint_88_avx2.S)
   - [dev/x86_64/src/polyz_unpack_17_avx2.c](dev/x86_64/src/polyz_unpack_17_avx2.c)
   - [dev/x86_64/src/polyz_unpack_19_avx2.c](dev/x86_64/src/polyz_unpack_19_avx2.c)
   - [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c)
@@ -241,8 +241,8 @@ source code and documentation.
   - [mldsa/src/native/x86_64/src/poly_chknorm_avx2.c](mldsa/src/native/x86_64/src/poly_chknorm_avx2.c)
   - [mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c)
   - [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c)
-  - [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c)
-  - [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c)
+  - [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.S](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.S)
+  - [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.S](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.S)
   - [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c)
   - [mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c)
   - [mldsa/src/native/x86_64/src/rej_uniform_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_avx2.c)
diff --git a/dev/x86_64/src/poly_use_hint_32_avx2.S b/dev/x86_64/src/poly_use_hint_32_avx2.S
new file mode 100644
index 000000000..cd7939c8d
--- /dev/null
+++ b/dev/x86_64/src/poly_use_hint_32_avx2.S
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ *   CRYSTALS-Dilithium optimized AVX2 implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+
+/*************************************************
+ * Name:        mld_poly_use_hint
+ *
+ * Description: Use hint polynomial to correct the high bits of a polynomial.
+ *
+ * Arguments:   - mld_poly *b: pointer to output polynomial with corrected high
+ *                bits
+ *              - const mld_poly *a: pointer to input polynomial
+ *              - const mld_poly *hint: pointer to input hint polynomial
+ **************************************************/
+
+
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
+    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
+
+
+/* simpasm: header-end */
+
+/* Reference:
+ * - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop.
+ * - Our implementation of decompose() is slightly different from that in
+ *   @[REF_AVX2]. See poly_decompose_32_avx2 for more information.
+ */
+
+// a aliased with a0
+.macro decompose32_avx2 a1, a, temp1, temp2, temp3
+/* a1, a0 = decompose(a)
+ * See poly_decompose_32_avx2 for more information. */
+vpaddd	\a, %ymm5,  \temp1
+vpsrld	$7, \temp1, \temp1
+vpmulhuw	%ymm8, \temp1, \temp1
+vpmulhrsw	%ymm7, \temp1, \temp1
+/* Check for wrap-around; set a1 = 0 if required */
+vpcmpgtd	%ymm4, \a, \temp2
+vpandn	\temp1, \temp2, \a1
+/* Compute remainder a0 */
+vpslld	$10, \temp1, \temp3
+vpsubd	\temp1, \temp3, \temp1
+vpslld	$9, \temp1, \temp1
+vpsubd	\temp1, \a, \a
+/* If wrap-around is required, a0 -= 1 */
+vpaddd	\temp2, \a, \a
+.endm
+
+/* Reference: The reference avx2 implementation checks a0 >= 0, which is
+ * different from the specification and the reference C implementation. We
+ * follow the specification and check a0 > 0.
+ */
+
+// a aliased with delta
+.macro use_hint32_avx2 b, a, h, a1, temp1, temp2, temp3
+decompose32_avx2 \a1, \a, \temp1, \temp2, \temp3
+
+/* h = (a0 > 0) ? h : -h */
+vpcmpgtd	%ymm6, \a, \a
+vpandn	\h, \a, \a
+vpslld	$1, \a, \a
+vpsubd	\a, \h, \h
+
+/* b = (a1 + h) % 16 */
+vpaddd	\a1, \h, \b
+vpand	%ymm3, \b, \b
+.endm
+
+.text
+.balign 16
+.global MLD_ASM_NAMESPACE(mld_poly_use_hint_32_avx2)
+MLD_ASM_FN_SYMBOL(mld_poly_use_hint_32_avx2)
+
+// Initialize constants
+movl	$127, %ecx
+
+/* check-magic: 1025 == floor(2^22 / 4092) */
+movl $1025, %r8d
+vmovd %r8d, %xmm8
+vpbroadcastd %xmm8, %ymm8
+
+xorl	%eax, %eax
+vpxor	%xmm6, %xmm6, %xmm6
+vmovd	%ecx, %xmm5
+
+/* 87 * ((Q-1) / 32), wrap-around threshold */
+movl	$22784256, %ecx
+
+/* round(x * 2^9 / 2^15) => round(x / 2^6), for f1 = round(f1''/ 2^6)*/
+movl $512, %r9d              
+vmovd %r9d, %xmm7              
+vpbroadcastd %xmm7, %ymm7     
+
+vmovd	%ecx, %xmm4
+movl	$15, %ecx
+vpbroadcastd	%xmm5, %ymm5
+vmovd	%ecx, %xmm3
+vpbroadcastd	%xmm4, %ymm4
+vpbroadcastd	%xmm3, %ymm3
+
+
+mld_poly_use_hint_32_avx2_loop:
+vmovdqa	(%rsi,%rax), %ymm0
+vmovdqa	(%rdx,%rax), %ymm2
+
+use_hint32_avx2 %ymm2, %ymm0, %ymm2, %ymm9, %ymm1, %ymm11, %ymm10
+
+vmovdqa	%ymm2, (%rdi,%rax)
+addq	$32, %rax
+cmpq	$1024, %rax
+jne	mld_poly_use_hint_32_avx2_loop
+ret
+
+/* simpasm: footer-start */
+
+#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+          && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
+          65 || MLD_CONFIG_PARAMETER_SET == 87) */
diff --git a/dev/x86_64/src/poly_use_hint_32_avx2.c b/dev/x86_64/src/poly_use_hint_32_avx2.c
deleted file mode 100644
index 9a5c866bb..000000000
--- a/dev/x86_64/src/poly_use_hint_32_avx2.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) The mldsa-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [REF_AVX2]
- *   CRYSTALS-Dilithium optimized AVX2 implementation
- *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
- *   https://github.com/pq-crystals/dilithium/tree/master/avx2
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Dilithium implementation @[REF_AVX2].
- */
-
-#include "../../../common.h"
-
-#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
-    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
-    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
-     (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
-
-#include <immintrin.h>
-#include "arith_native_x86_64.h"
-#include "consts.h"
-
-#define MLD_MM256_BLENDV_EPI32(a, b, mask)                     \
-  _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
-                                       _mm256_castsi256_ps(b), \
-                                       _mm256_castsi256_ps(mask)))
-
-void mld_poly_use_hint_32_avx2(int32_t *b, const int32_t *a,
-                               const int32_t *hint)
-{
-  unsigned int i;
-  __m256i f, f0, f1, h, t;
-  const __m256i q_bound = _mm256_set1_epi32(87 * ((MLDSA_Q - 1) / 32));
-  /* check-magic: 1025 == floor(2**22 / 4092) */
-  const __m256i v = _mm256_set1_epi32(1025);
-  const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 32));
-  const __m256i off = _mm256_set1_epi32(127);
-  const __m256i shift = _mm256_set1_epi32(512);
-  const __m256i mask = _mm256_set1_epi32(15);
-  const __m256i zero = _mm256_setzero_si256();
-
-  for (i = 0; i < MLDSA_N / 8; i++)
-  {
-    f = _mm256_load_si256((const __m256i *)&a[8 * i]);
-    h = _mm256_load_si256((const __m256i *)&hint[8 * i]);
-
-    /* Reference:
-     * - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop.
-     * - Our implementation of decompose() is slightly different from that in
-     *   @[REF_AVX2]. See poly_decompose_32_avx2.c for more information.
-     */
-    /* f1, f2 = decompose(f) */
-    f1 = _mm256_add_epi32(f, off);
-    f1 = _mm256_srli_epi32(f1, 7);
-    f1 = _mm256_mulhi_epu16(f1, v);
-    f1 = _mm256_mulhrs_epi16(f1, shift);
-    t = _mm256_cmpgt_epi32(f, q_bound);
-    f0 = _mm256_mullo_epi32(f1, alpha);
-    f0 = _mm256_sub_epi32(f, f0);
-    f1 = _mm256_andnot_si256(t, f1);
-    f0 = _mm256_add_epi32(f0, t);
-
-    /* Reference: The reference avx2 implementation checks a0 >= 0, which is
-     * different from the specification and the reference C implementation. We
-     * follow the specification and check a0 > 0.
-     */
-    /* t = (f0 > 0) ? h : -h */
-    f0 = _mm256_cmpgt_epi32(f0, zero);
-    t = MLD_MM256_BLENDV_EPI32(h, zero, f0);
-    t = _mm256_slli_epi32(t, 1);
-    h = _mm256_sub_epi32(h, t);
-
-    /* f1 = (f1 + t) % 16 */
-    f1 = _mm256_add_epi32(f1, h);
-    f1 = _mm256_and_si256(f1, mask);
-
-    _mm256_store_si256((__m256i *)&b[8 * i], f1);
-  }
-}
-
-#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
-         && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
-         65 || MLD_CONFIG_PARAMETER_SET == 87) */
-
-MLD_EMPTY_CU(avx2_poly_use_hint_32)
-
-#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&                                \
-          !MLD_CONFIG_MULTILEVEL_NO_SHARED &&                                  \
-          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
-          || MLD_CONFIG_PARAMETER_SET == 87)) */
-
-/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
- * Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef MLD_MM256_BLENDV_EPI32
diff --git a/dev/x86_64/src/poly_use_hint_88_avx2.S b/dev/x86_64/src/poly_use_hint_88_avx2.S
new file mode 100644
index 000000000..08bec8d09
--- /dev/null
+++ b/dev/x86_64/src/poly_use_hint_88_avx2.S
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ *   CRYSTALS-Dilithium optimized AVX2 implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+
+/*************************************************
+ * Name:        mld_poly_use_hint
+ *
+ * Description: Use hint polynomial to correct the high bits of a polynomial.
+ *
+ * Arguments:   - mld_poly *b: pointer to output polynomial with corrected high
+ *                bits
+ *              - const mld_poly *a: pointer to input polynomial
+ *              - const mld_poly *hint: pointer to input hint polynomial
+ **************************************************/
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
+    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    MLD_CONFIG_PARAMETER_SET == 44)
+/* simpasm: header-end */
+
+/* Reference:
+ * - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop.
+ * - Our implementation of decompose() is slightly different from that in
+ *   @[REF_AVX2]. See poly_decompose_88_avx2 for more information.
+ */
+
+// a aliased with a0
+.macro decompose88_avx2 a1, a, temp1, temp2, temp3
+/* a1, a0 = decompose(a)
+ * See poly_decompose_88_avx2 for more information. */
+vpaddd	\a, %ymm4, \temp1
+vpsrld	$7, \temp1, \temp1
+vpmulhuw	%ymm8, \temp1, \temp1
+vpmulhrsw	%ymm7, \temp1, \temp1
+/* Check for wrap-around; set a1 = 0 if required */
+vpcmpgtd	%ymm3, \a, \temp2
+vpandn	\temp1, \temp2, \a1
+/* Compute remainder a0 */
+vpslld	$1, \temp1, \temp3
+vpaddd	\temp1, \temp3, \temp3
+vpslld	$5, \temp3, \temp1
+vpsubd	\temp3, \temp1, \temp1
+vpslld	$11, \temp1, \temp1
+vpsubd	\temp1, \a, \a
+/* If wrap-around is required, a0 -= 1 */
+vpaddd	\temp2, \a, \a
+.endm
+
+/* Reference: The reference avx2 implementation checks a0 >= 0, which is
+ * different from the specification and the reference C implementation. We
+ * follow the specification and check a0 > 0.
+ */
+
+// a aliased with delta
+.macro use_hint88_avx2 b, a, h, a1, temp1, temp2, temp3
+decompose88_avx2 \a1, \a, \temp1, \temp2, \temp3
+
+/* h = (a0 > 0) ? h : -h */
+vpcmpgtd	%ymm5, \a, \a
+vpandn	\h, \a, \a
+vpslld	$1, \a, \a
+vpsubd	\a, \h, \a
+
+/* b = (a1 + h) % 44 */
+vpaddd	\a1, \a, \b
+vblendvps	\b, %ymm6, \b, \b
+vpcmpgtd	%ymm6, \b, \h
+vpandn	\b, \h, \b
+.endm
+
+.text
+.balign 16
+.global MLD_ASM_NAMESPACE(mld_poly_use_hint_88_avx2)  
+MLD_ASM_FN_SYMBOL(mld_poly_use_hint_88_avx2)
+
+// Initialize constants
+movl	$127, %ecx
+xorl	%eax, %eax
+vpxor	%xmm5, %xmm5, %xmm5
+
+/* check-magic: 11275 == floor(2^24 / 1488) */
+movl $11275, %r8d
+vmovd %r8d, %xmm8
+vpbroadcastd %xmm8, %ymm8
+
+vmovd	%ecx, %xmm4
+
+/* 87 * ((Q-1) / 88), wrap-around threshold */
+movl	$8285184, %ecx
+
+/* round(x * 2^7 / 2^15) => round(x / 2^8), for f1 = round(f1''/ 2^8)*/
+movl $128, %r9d
+vmovd %r9d, %xmm7
+vpbroadcastd %xmm7, %ymm7
+
+ /* max a1 value */
+movl $43, %r10d
+vmovd %r10d, %xmm6
+vpbroadcastd %xmm6, %ymm6
+
+vmovd	%ecx, %xmm3
+vpbroadcastd	%xmm4, %ymm4
+vpbroadcastd	%xmm3, %ymm3
+
+mld_poly_use_hint_88_avx2_loop:
+vmovdqa	(%rsi,%rax), %ymm0
+vmovdqa	(%rdx,%rax), %ymm1
+
+use_hint88_avx2 %ymm0, %ymm0, %ymm1, %ymm9, %ymm10, %ymm11, %ymm12
+
+vmovdqa	%ymm0, (%rdi,%rax)
+addq	$32, %rax
+cmpq	$1024, %rax
+jne	mld_poly_use_hint_88_avx2_loop
+
+ret
+
+/* simpasm: footer-start */
+
+#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+          && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
+          44) */
+
diff --git a/dev/x86_64/src/poly_use_hint_88_avx2.c b/dev/x86_64/src/poly_use_hint_88_avx2.c
deleted file mode 100644
index 32fa2e6c6..000000000
--- a/dev/x86_64/src/poly_use_hint_88_avx2.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) The mldsa-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [REF_AVX2]
- *   CRYSTALS-Dilithium optimized AVX2 implementation
- *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
- *   https://github.com/pq-crystals/dilithium/tree/master/avx2
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Dilithium implementation @[REF_AVX2].
- */
-
-#include "../../../common.h"
-
-#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
-    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
-    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
-     MLD_CONFIG_PARAMETER_SET == 44)
-
-#include <immintrin.h>
-#include "arith_native_x86_64.h"
-#include "consts.h"
-
-#define MLD_MM256_BLENDV_EPI32(a, b, mask)                     \
-  _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
-                                       _mm256_castsi256_ps(b), \
-                                       _mm256_castsi256_ps(mask)))
-
-void mld_poly_use_hint_88_avx2(int32_t *b, const int32_t *a,
-                               const int32_t *hint)
-{
-  unsigned int i;
-  __m256i f, f0, f1, h, t;
-  const __m256i q_bound = _mm256_set1_epi32(87 * ((MLDSA_Q - 1) / 88));
-  /* check-magic: 11275 == floor(2**24 / 1488) */
-  const __m256i v = _mm256_set1_epi32(11275);
-  const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 88));
-  const __m256i off = _mm256_set1_epi32(127);
-  const __m256i shift = _mm256_set1_epi32(128);
-  const __m256i max = _mm256_set1_epi32(43);
-  const __m256i zero = _mm256_setzero_si256();
-
-  for (i = 0; i < MLDSA_N / 8; i++)
-  {
-    f = _mm256_load_si256((const __m256i *)&a[8 * i]);
-    h = _mm256_load_si256((const __m256i *)&hint[8 * i]);
-
-    /* Reference:
-     * - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop.
-     * - Our implementation of decompose() is slightly different from that in
-     *   @[REF_AVX2]. See poly_decompose_88_avx2.c for more information.
-     */
-    /* f1, f2 = decompose(f) */
-    f1 = _mm256_add_epi32(f, off);
-    f1 = _mm256_srli_epi32(f1, 7);
-    f1 = _mm256_mulhi_epu16(f1, v);
-    f1 = _mm256_mulhrs_epi16(f1, shift);
-    t = _mm256_cmpgt_epi32(f, q_bound);
-    f0 = _mm256_mullo_epi32(f1, alpha);
-    f0 = _mm256_sub_epi32(f, f0);
-    f1 = _mm256_andnot_si256(t, f1);
-    f0 = _mm256_add_epi32(f0, t);
-
-    /* Reference: The reference avx2 implementation checks a0 >= 0, which is
-     * different from the specification and the reference C implementation. We
-     * follow the specification and check a0 > 0.
-     */
-    /* t = (f0 > 0) ? h : -h */
-    f0 = _mm256_cmpgt_epi32(f0, zero);
-    t = MLD_MM256_BLENDV_EPI32(h, zero, f0);
-    t = _mm256_slli_epi32(t, 1);
-    h = _mm256_sub_epi32(h, t);
-
-    /* f1 = (f1 + t) % 44 */
-    f1 = _mm256_add_epi32(f1, h);
-    f1 = MLD_MM256_BLENDV_EPI32(f1, max, f1);
-    f = _mm256_cmpgt_epi32(f1, max);
-    f1 = MLD_MM256_BLENDV_EPI32(f1, zero, f);
-
-    _mm256_store_si256((__m256i *)&b[8 * i], f1);
-  }
-}
-
-#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
-         && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
-         44) */
-
-MLD_EMPTY_CU(avx2_poly_use_hint_88)
-
-#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&                             \
-          !MLD_CONFIG_MULTILEVEL_NO_SHARED &&                               \
-          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
-          44)) */
-
-/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
- * Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef MLD_MM256_BLENDV_EPI32
diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c
index 9ad43f785..735ff937a 100644
--- a/mldsa/mldsa_native.c
+++ b/mldsa/mldsa_native.c
@@ -85,8 +85,6 @@
 #include "src/native/x86_64/src/poly_chknorm_avx2.c"
 #include "src/native/x86_64/src/poly_decompose_32_avx2.c"
 #include "src/native/x86_64/src/poly_decompose_88_avx2.c"
-#include "src/native/x86_64/src/poly_use_hint_32_avx2.c"
-#include "src/native/x86_64/src/poly_use_hint_88_avx2.c"
 #include "src/native/x86_64/src/polyz_unpack_17_avx2.c"
 #include "src/native/x86_64/src/polyz_unpack_19_avx2.c"
 #include "src/native/x86_64/src/rej_uniform_avx2.c"
diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S
index f0c54203f..e427ce701 100644
--- a/mldsa/mldsa_native_asm.S
+++ b/mldsa/mldsa_native_asm.S
@@ -87,6 +87,8 @@
 #include "src/native/x86_64/src/pointwise_acc_l5.S"
 #include "src/native/x86_64/src/pointwise_acc_l7.S"
 #include "src/native/x86_64/src/poly_caddq_avx2.S"
+#include "src/native/x86_64/src/poly_use_hint_32_avx2.S"
+#include "src/native/x86_64/src/poly_use_hint_88_avx2.S"
 #endif /* MLD_SYS_X86_64 */
 #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */
 
diff --git a/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.S b/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.S
new file mode 100644
index 000000000..626c13d6b
--- /dev/null
+++ b/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.S
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ *   CRYSTALS-Dilithium optimized AVX2 implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+
+/*************************************************
+ * Name:        mld_poly_use_hint
+ *
+ * Description: Use hint polynomial to correct the high bits of a polynomial.
+ *
+ * Arguments:   - mld_poly *b: pointer to output polynomial with corrected high
+ *                bits
+ *              - const mld_poly *a: pointer to input polynomial
+ *              - const mld_poly *hint: pointer to input hint polynomial
+ **************************************************/
+
+
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
+    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
+
+
+
+/*
+ * WARNING: This file is auto-derived from the mldsa-native source file
+ *   dev/x86_64/src/poly_use_hint_32_avx2.S using scripts/simpasm. Do not modify it directly.
+ */
+
+.text
+.balign 4
+.global MLD_ASM_NAMESPACE(mld_poly_use_hint_32_avx2)
+MLD_ASM_FN_SYMBOL(mld_poly_use_hint_32_avx2)
+
+        .cfi_startproc
+        movl	$0x7f, %ecx
+        movl	$0x401, %r8d            # imm = 0x401
+        vmovd	%r8d, %xmm8
+        vpbroadcastd	%xmm8, %ymm8
+        xorl	%eax, %eax
+        vpxor	%xmm6, %xmm6, %xmm6
+        vmovd	%ecx, %xmm5
+        movl	$0x15ba900, %ecx        # imm = 0x15BA900
+        movl	$0x200, %r9d            # imm = 0x200
+        vmovd	%r9d, %xmm7
+        vpbroadcastd	%xmm7, %ymm7
+        vmovd	%ecx, %xmm4
+        movl	$0xf, %ecx
+        vpbroadcastd	%xmm5, %ymm5
+        vmovd	%ecx, %xmm3
+        vpbroadcastd	%xmm4, %ymm4
+        vpbroadcastd	%xmm3, %ymm3
+
+Lmld_poly_use_hint_32_avx2_loop:
+        vmovdqa	(%rsi,%rax), %ymm0
+        vmovdqa	(%rdx,%rax), %ymm2
+        vpaddd	%ymm0, %ymm5, %ymm1
+        vpsrld	$0x7, %ymm1, %ymm1
+        vpmulhuw	%ymm8, %ymm1, %ymm1
+        vpmulhrsw	%ymm7, %ymm1, %ymm1
+        vpcmpgtd	%ymm4, %ymm0, %ymm11
+        vpandn	%ymm1, %ymm11, %ymm9
+        vpslld	$0xa, %ymm1, %ymm10
+        vpsubd	%ymm1, %ymm10, %ymm1
+        vpslld	$0x9, %ymm1, %ymm1
+        vpsubd	%ymm1, %ymm0, %ymm0
+        vpaddd	%ymm11, %ymm0, %ymm0
+        vpcmpgtd	%ymm6, %ymm0, %ymm0
+        vpandn	%ymm2, %ymm0, %ymm0
+        vpslld	$0x1, %ymm0, %ymm0
+        vpsubd	%ymm0, %ymm2, %ymm2
+        vpaddd	%ymm9, %ymm2, %ymm2
+        vpand	%ymm3, %ymm2, %ymm2
+        vmovdqa	%ymm2, (%rdi,%rax)
+        addq	$0x20, %rax
+        cmpq	$0x400, %rax            # imm = 0x400
+        jne	Lmld_poly_use_hint_32_avx2_loop
+        retq
+        .cfi_endproc
+
+MLD_ASM_FN_SIZE(mld_poly_use_hint_32_avx2)
+
+
+#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+          && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
+          65 || MLD_CONFIG_PARAMETER_SET == 87) */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c b/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c
deleted file mode 100644
index 9a5c866bb..000000000
--- a/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) The mldsa-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [REF_AVX2]
- *   CRYSTALS-Dilithium optimized AVX2 implementation
- *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
- *   https://github.com/pq-crystals/dilithium/tree/master/avx2
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Dilithium implementation @[REF_AVX2].
- */
-
-#include "../../../common.h"
-
-#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
-    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
-    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
-     (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
-
-#include <immintrin.h>
-#include "arith_native_x86_64.h"
-#include "consts.h"
-
-#define MLD_MM256_BLENDV_EPI32(a, b, mask)                     \
-  _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
-                                       _mm256_castsi256_ps(b), \
-                                       _mm256_castsi256_ps(mask)))
-
-void mld_poly_use_hint_32_avx2(int32_t *b, const int32_t *a,
-                               const int32_t *hint)
-{
-  unsigned int i;
-  __m256i f, f0, f1, h, t;
-  const __m256i q_bound = _mm256_set1_epi32(87 * ((MLDSA_Q - 1) / 32));
-  /* check-magic: 1025 == floor(2**22 / 4092) */
-  const __m256i v = _mm256_set1_epi32(1025);
-  const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 32));
-  const __m256i off = _mm256_set1_epi32(127);
-  const __m256i shift = _mm256_set1_epi32(512);
-  const __m256i mask = _mm256_set1_epi32(15);
-  const __m256i zero = _mm256_setzero_si256();
-
-  for (i = 0; i < MLDSA_N / 8; i++)
-  {
-    f = _mm256_load_si256((const __m256i *)&a[8 * i]);
-    h = _mm256_load_si256((const __m256i *)&hint[8 * i]);
-
-    /* Reference:
-     * - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop.
-     * - Our implementation of decompose() is slightly different from that in
-     *   @[REF_AVX2]. See poly_decompose_32_avx2.c for more information.
-     */
-    /* f1, f2 = decompose(f) */
-    f1 = _mm256_add_epi32(f, off);
-    f1 = _mm256_srli_epi32(f1, 7);
-    f1 = _mm256_mulhi_epu16(f1, v);
-    f1 = _mm256_mulhrs_epi16(f1, shift);
-    t = _mm256_cmpgt_epi32(f, q_bound);
-    f0 = _mm256_mullo_epi32(f1, alpha);
-    f0 = _mm256_sub_epi32(f, f0);
-    f1 = _mm256_andnot_si256(t, f1);
-    f0 = _mm256_add_epi32(f0, t);
-
-    /* Reference: The reference avx2 implementation checks a0 >= 0, which is
-     * different from the specification and the reference C implementation. We
-     * follow the specification and check a0 > 0.
-     */
-    /* t = (f0 > 0) ? h : -h */
-    f0 = _mm256_cmpgt_epi32(f0, zero);
-    t = MLD_MM256_BLENDV_EPI32(h, zero, f0);
-    t = _mm256_slli_epi32(t, 1);
-    h = _mm256_sub_epi32(h, t);
-
-    /* f1 = (f1 + t) % 16 */
-    f1 = _mm256_add_epi32(f1, h);
-    f1 = _mm256_and_si256(f1, mask);
-
-    _mm256_store_si256((__m256i *)&b[8 * i], f1);
-  }
-}
-
-#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
-         && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
-         65 || MLD_CONFIG_PARAMETER_SET == 87) */
-
-MLD_EMPTY_CU(avx2_poly_use_hint_32)
-
-#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&                                \
-          !MLD_CONFIG_MULTILEVEL_NO_SHARED &&                                  \
-          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
-          || MLD_CONFIG_PARAMETER_SET == 87)) */
-
-/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
- * Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef MLD_MM256_BLENDV_EPI32
diff --git a/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.S b/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.S
new file mode 100644
index 000000000..407233d7b
--- /dev/null
+++ b/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.S
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ *   CRYSTALS-Dilithium optimized AVX2 implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+
+/*************************************************
+ * Name:        mld_poly_use_hint
+ *
+ * Description: Use hint polynomial to correct the high bits of a polynomial.
+ *
+ * Arguments:   - mld_poly *b: pointer to output polynomial with corrected high
+ *                bits
+ *              - const mld_poly *a: pointer to input polynomial
+ *              - const mld_poly *hint: pointer to input hint polynomial
+ **************************************************/
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
+    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    MLD_CONFIG_PARAMETER_SET == 44)
+
+/*
+ * WARNING: This file is auto-derived from the mldsa-native source file
+ *   dev/x86_64/src/poly_use_hint_88_avx2.S using scripts/simpasm. Do not modify it directly.
+ */
+
+.text
+.balign 4
+.global MLD_ASM_NAMESPACE(mld_poly_use_hint_88_avx2)  
+MLD_ASM_FN_SYMBOL(mld_poly_use_hint_88_avx2)  
+
+        .cfi_startproc
+        movl	$0x7f, %ecx
+        xorl	%eax, %eax
+        vpxor	%xmm5, %xmm5, %xmm5
+        movl	$0x2c0b, %r8d           # imm = 0x2C0B
+        vmovd	%r8d, %xmm8
+        vpbroadcastd	%xmm8, %ymm8
+        vmovd	%ecx, %xmm4
+        movl	$0x7e6c00, %ecx         # imm = 0x7E6C00
+        movl	$0x80, %r9d
+        vmovd	%r9d, %xmm7
+        vpbroadcastd	%xmm7, %ymm7
+        movl	$0x2b, %r10d
+        vmovd	%r10d, %xmm6
+        vpbroadcastd	%xmm6, %ymm6
+        vmovd	%ecx, %xmm3
+        vpbroadcastd	%xmm4, %ymm4
+        vpbroadcastd	%xmm3, %ymm3
+
+Lmld_poly_use_hint_88_avx2_loop:
+        vmovdqa	(%rsi,%rax), %ymm0
+        vmovdqa	(%rdx,%rax), %ymm1
+        vpaddd	%ymm0, %ymm4, %ymm10
+        vpsrld	$0x7, %ymm10, %ymm10
+        vpmulhuw	%ymm8, %ymm10, %ymm10
+        vpmulhrsw	%ymm7, %ymm10, %ymm10
+        vpcmpgtd	%ymm3, %ymm0, %ymm11
+        vpandn	%ymm10, %ymm11, %ymm9
+        vpslld	$0x1, %ymm10, %ymm12
+        vpaddd	%ymm10, %ymm12, %ymm12
+        vpslld	$0x5, %ymm12, %ymm10
+        vpsubd	%ymm12, %ymm10, %ymm10
+        vpslld	$0xb, %ymm10, %ymm10
+        vpsubd	%ymm10, %ymm0, %ymm0
+        vpaddd	%ymm11, %ymm0, %ymm0
+        vpcmpgtd	%ymm5, %ymm0, %ymm0
+        vpandn	%ymm1, %ymm0, %ymm0
+        vpslld	$0x1, %ymm0, %ymm0
+        vpsubd	%ymm0, %ymm1, %ymm0
+        vpaddd	%ymm9, %ymm0, %ymm0
+        vblendvps	%ymm0, %ymm6, %ymm0, %ymm0
+        vpcmpgtd	%ymm6, %ymm0, %ymm1
+        vpandn	%ymm0, %ymm1, %ymm0
+        vmovdqa	%ymm0, (%rdi,%rax)
+        addq	$0x20, %rax
+        cmpq	$0x400, %rax            # imm = 0x400
+        jne	Lmld_poly_use_hint_88_avx2_loop
+        retq
+        .cfi_endproc
+
+MLD_ASM_FN_SIZE(mld_poly_use_hint_88_avx2)  
+
+
+#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+          && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
+          44) */
+
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c b/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c
deleted file mode 100644
index 32fa2e6c6..000000000
--- a/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) The mldsa-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [REF_AVX2]
- *   CRYSTALS-Dilithium optimized AVX2 implementation
- *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
- *   https://github.com/pq-crystals/dilithium/tree/master/avx2
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Dilithium implementation @[REF_AVX2].
- */
-
-#include "../../../common.h"
-
-#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
-    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
-    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
-     MLD_CONFIG_PARAMETER_SET == 44)
-
-#include <immintrin.h>
-#include "arith_native_x86_64.h"
-#include "consts.h"
-
-#define MLD_MM256_BLENDV_EPI32(a, b, mask)                     \
-  _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
-                                       _mm256_castsi256_ps(b), \
-                                       _mm256_castsi256_ps(mask)))
-
-void mld_poly_use_hint_88_avx2(int32_t *b, const int32_t *a,
-                               const int32_t *hint)
-{
-  unsigned int i;
-  __m256i f, f0, f1, h, t;
-  const __m256i q_bound = _mm256_set1_epi32(87 * ((MLDSA_Q - 1) / 88));
-  /* check-magic: 11275 == floor(2**24 / 1488) */
-  const __m256i v = _mm256_set1_epi32(11275);
-  const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 88));
-  const __m256i off = _mm256_set1_epi32(127);
-  const __m256i shift = _mm256_set1_epi32(128);
-  const __m256i max = _mm256_set1_epi32(43);
-  const __m256i zero = _mm256_setzero_si256();
-
-  for (i = 0; i < MLDSA_N / 8; i++)
-  {
-    f = _mm256_load_si256((const __m256i *)&a[8 * i]);
-    h = _mm256_load_si256((const __m256i *)&hint[8 * i]);
-
-    /* Reference:
-     * - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop.
-     * - Our implementation of decompose() is slightly different from that in
-     *   @[REF_AVX2]. See poly_decompose_88_avx2.c for more information.
-     */
-    /* f1, f2 = decompose(f) */
-    f1 = _mm256_add_epi32(f, off);
-    f1 = _mm256_srli_epi32(f1, 7);
-    f1 = _mm256_mulhi_epu16(f1, v);
-    f1 = _mm256_mulhrs_epi16(f1, shift);
-    t = _mm256_cmpgt_epi32(f, q_bound);
-    f0 = _mm256_mullo_epi32(f1, alpha);
-    f0 = _mm256_sub_epi32(f, f0);
-    f1 = _mm256_andnot_si256(t, f1);
-    f0 = _mm256_add_epi32(f0, t);
-
-    /* Reference: The reference avx2 implementation checks a0 >= 0, which is
-     * different from the specification and the reference C implementation. We
-     * follow the specification and check a0 > 0.
-     */
-    /* t = (f0 > 0) ? h : -h */
-    f0 = _mm256_cmpgt_epi32(f0, zero);
-    t = MLD_MM256_BLENDV_EPI32(h, zero, f0);
-    t = _mm256_slli_epi32(t, 1);
-    h = _mm256_sub_epi32(h, t);
-
-    /* f1 = (f1 + t) % 44 */
-    f1 = _mm256_add_epi32(f1, h);
-    f1 = MLD_MM256_BLENDV_EPI32(f1, max, f1);
-    f = _mm256_cmpgt_epi32(f1, max);
-    f1 = MLD_MM256_BLENDV_EPI32(f1, zero, f);
-
-    _mm256_store_si256((__m256i *)&b[8 * i], f1);
-  }
-}
-
-#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
-         && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
-         44) */
-
-MLD_EMPTY_CU(avx2_poly_use_hint_88)
-
-#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&                             \
-          !MLD_CONFIG_MULTILEVEL_NO_SHARED &&                               \
-          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
-          44)) */
-
-/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
- * Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef MLD_MM256_BLENDV_EPI32
diff --git a/test/bench/bench_components_mldsa.c b/test/bench/bench_components_mldsa.c
index 6cf8627e5..b2c25e2da 100644
--- a/test/bench/bench_components_mldsa.c
+++ b/test/bench/bench_components_mldsa.c
@@ -61,6 +61,7 @@ static int cmp_uint64_t(const void *a, const void *b)
 static int bench(void)
 {
   MLD_ALIGN int32_t data0[256];
+  MLD_ALIGN int32_t data1[256];
   MLD_ALIGN mld_poly poly_out;
   MLD_ALIGN mld_polyvecl polyvecl_a, polyvecl_b;
   MLD_ALIGN mld_polyveck polyveck_out;
@@ -86,6 +87,9 @@ static int bench(void)
 
   BENCH("poly_caddq", mld_poly_caddq((mld_poly *)data0));
 
+  BENCH("poly_use_hint",
+        mld_poly_use_hint(&poly_out, (mld_poly *)data0, (mld_poly *)data1));
+
   return 0;
 }