diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 1b284974c..56f095a3e 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -223,8 +223,8 @@ source code and documentation. - [dev/x86_64/src/poly_chknorm_avx2.c](dev/x86_64/src/poly_chknorm_avx2.c) - [dev/x86_64/src/poly_decompose_32_avx2.c](dev/x86_64/src/poly_decompose_32_avx2.c) - [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c) - - [dev/x86_64/src/poly_use_hint_32_avx2.c](dev/x86_64/src/poly_use_hint_32_avx2.c) - - [dev/x86_64/src/poly_use_hint_88_avx2.c](dev/x86_64/src/poly_use_hint_88_avx2.c) + - [dev/x86_64/src/poly_use_hint_32_avx2.S](dev/x86_64/src/poly_use_hint_32_avx2.S) + - [dev/x86_64/src/poly_use_hint_88_avx2.S](dev/x86_64/src/poly_use_hint_88_avx2.S) - [dev/x86_64/src/polyz_unpack_17_avx2.c](dev/x86_64/src/polyz_unpack_17_avx2.c) - [dev/x86_64/src/polyz_unpack_19_avx2.c](dev/x86_64/src/polyz_unpack_19_avx2.c) - [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c) @@ -241,8 +241,8 @@ source code and documentation. - [mldsa/src/native/x86_64/src/poly_chknorm_avx2.c](mldsa/src/native/x86_64/src/poly_chknorm_avx2.c) - [mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c) - [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c) - - [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c) - - [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c) + - [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.S](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.S) + - [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.S](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.S) - [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c) - [mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c) - [mldsa/src/native/x86_64/src/rej_uniform_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_avx2.c) diff --git a/dev/x86_64/src/poly_use_hint_32_avx2.S b/dev/x86_64/src/poly_use_hint_32_avx2.S new file mode 100644 index 000000000..cd7939c8d --- /dev/null +++ b/dev/x86_64/src/poly_use_hint_32_avx2.S @@ -0,0 +1,139 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + + +/************************************************* + * Name: mld_poly_use_hint + * + * Description: Use hint polynomial to correct the high bits of a polynomial. + * + * Arguments: - mld_poly *b: pointer to output polynomial with corrected high + * bits + * - const mld_poly *a: pointer to input polynomial + * - const mld_poly *hint: pointer to input hint polynomial + **************************************************/ + + + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) + + +/* simpasm: header-end */ + +/* Reference: + * - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop. + * - Our implementation of decompose() is slightly different from that in + * @[REF_AVX2]. See poly_decompose_32_avx2 for more information. + */ + +// a aliased with a0 +.macro decompose32_avx2 a1, a, temp1, temp2, temp3 +/* a1, a0 = decompose(a) + * See poly_decompose_32_avx2 for more information. */ +vpaddd \a, %ymm5, \temp1 +vpsrld $7, \temp1, \temp1 +vpmulhuw %ymm8, \temp1, \temp1 +vpmulhrsw %ymm7, \temp1, \temp1 +/* Check for wrap-around; set a1 = 0 if required */ +vpcmpgtd %ymm4, \a, \temp2 +vpandn \temp1, \temp2, \a1 +/* Compute remainder a0 */ +vpslld $10, \temp1, \temp3 +vpsubd \temp1, \temp3, \temp1 +vpslld $9, \temp1, \temp1 +vpsubd \temp1, \a, \a +/* If wrap-around is required, a0 -= 1 */ +vpaddd \temp2, \a, \a +.endm + +/* Reference: The reference avx2 implementation checks a0 >= 0, which is + * different from the specification and the reference C implementation. We + * follow the specification and check a0 > 0. + */ + +// a aliased with delta +.macro use_hint32_avx2 b, a, h, a1, temp1, temp2, temp3 +decompose32_avx2 \a1, \a, \temp1, \temp2, \temp3 + +/* h = (a0 > 0) ? h : -h */ +vpcmpgtd %ymm6, \a, \a +vpandn \h, \a, \a +vpslld $1, \a, \a +vpsubd \a, \h, \h + +/* b = (a1 + h) % 16 */ +vpaddd \a1, \h, \b +vpand %ymm3, \b, \b +.endm + +.text +.balign 16 +.global MLD_ASM_NAMESPACE(mld_poly_use_hint_32_avx2) +MLD_ASM_FN_SYMBOL(mld_poly_use_hint_32_avx2) + +// Initialize constants +movl $127, %ecx + +/* check-magic: 1025 == floor(2^22 / 4092) */ +movl $1025, %r8d +vmovd %r8d, %xmm8 +vpbroadcastd %xmm8, %ymm8 + +xorl %eax, %eax +vpxor %xmm6, %xmm6, %xmm6 +vmovd %ecx, %xmm5 + +/* 87 * ((Q-1) / 32), wrap-around threshold */ +movl $22784256, %ecx + +/* round(x * 2^9 / 2^15) => round(x / 2^6), for f1 = round(f1''/ 2^6)*/ +movl $512, %r9d +vmovd %r9d, %xmm7 +vpbroadcastd %xmm7, %ymm7 + +vmovd %ecx, %xmm4 +movl $15, %ecx +vpbroadcastd %xmm5, %ymm5 +vmovd %ecx, %xmm3 +vpbroadcastd %xmm4, %ymm4 +vpbroadcastd %xmm3, %ymm3 + + +mld_poly_use_hint_32_avx2_loop: +vmovdqa (%rsi,%rax), %ymm0 +vmovdqa (%rdx,%rax), %ymm2 + +use_hint32_avx2 %ymm2, %ymm0, %ymm2, %ymm9, %ymm1, %ymm11, %ymm10 + +vmovdqa %ymm2, (%rdi,%rax) +addq $32, %rax +cmpq $1024, %rax +jne mld_poly_use_hint_32_avx2_loop +ret + +/* simpasm: footer-start */ + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ + 65 || MLD_CONFIG_PARAMETER_SET == 87) */ diff --git a/dev/x86_64/src/poly_use_hint_32_avx2.c b/dev/x86_64/src/poly_use_hint_32_avx2.c deleted file mode 100644 index 9a5c866bb..000000000 --- a/dev/x86_64/src/poly_use_hint_32_avx2.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [REF_AVX2] - * CRYSTALS-Dilithium optimized AVX2 implementation - * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé - * https://github.com/pq-crystals/dilithium/tree/master/avx2 - */ - -/* - * This file is derived from the public domain - * AVX2 Dilithium implementation @[REF_AVX2]. - */ - -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ - (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) - -#include -#include "arith_native_x86_64.h" -#include "consts.h" - -#define MLD_MM256_BLENDV_EPI32(a, b, mask) \ - _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ - _mm256_castsi256_ps(b), \ - _mm256_castsi256_ps(mask))) - -void mld_poly_use_hint_32_avx2(int32_t *b, const int32_t *a, - const int32_t *hint) -{ - unsigned int i; - __m256i f, f0, f1, h, t; - const __m256i q_bound = _mm256_set1_epi32(87 * ((MLDSA_Q - 1) / 32)); - /* check-magic: 1025 == floor(2**22 / 4092) */ - const __m256i v = _mm256_set1_epi32(1025); - const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 32)); - const __m256i off = _mm256_set1_epi32(127); - const __m256i shift = _mm256_set1_epi32(512); - const __m256i mask = _mm256_set1_epi32(15); - const __m256i zero = _mm256_setzero_si256(); - - for (i = 0; i < MLDSA_N / 8; i++) - { - f = _mm256_load_si256((const __m256i *)&a[8 * i]); - h = _mm256_load_si256((const __m256i *)&hint[8 * i]); - - /* Reference: - * - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop. - * - Our implementation of decompose() is slightly different from that in - * @[REF_AVX2]. See poly_decompose_32_avx2.c for more information. - */ - /* f1, f2 = decompose(f) */ - f1 = _mm256_add_epi32(f, off); - f1 = _mm256_srli_epi32(f1, 7); - f1 = _mm256_mulhi_epu16(f1, v); - f1 = _mm256_mulhrs_epi16(f1, shift); - t = _mm256_cmpgt_epi32(f, q_bound); - f0 = _mm256_mullo_epi32(f1, alpha); - f0 = _mm256_sub_epi32(f, f0); - f1 = _mm256_andnot_si256(t, f1); - f0 = _mm256_add_epi32(f0, t); - - /* Reference: The reference avx2 implementation checks a0 >= 0, which is - * different from the specification and the reference C implementation. We - * follow the specification and check a0 > 0. - */ - /* t = (f0 > 0) ? h : -h */ - f0 = _mm256_cmpgt_epi32(f0, zero); - t = MLD_MM256_BLENDV_EPI32(h, zero, f0); - t = _mm256_slli_epi32(t, 1); - h = _mm256_sub_epi32(h, t); - - /* f1 = (f1 + t) % 16 */ - f1 = _mm256_add_epi32(f1, h); - f1 = _mm256_and_si256(f1, mask); - - _mm256_store_si256((__m256i *)&b[8 * i], f1); - } -} - -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ - && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ - 65 || MLD_CONFIG_PARAMETER_SET == 87) */ - -MLD_EMPTY_CU(avx2_poly_use_hint_32) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ - || MLD_CONFIG_PARAMETER_SET == 87)) */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef MLD_MM256_BLENDV_EPI32 diff --git a/dev/x86_64/src/poly_use_hint_88_avx2.S b/dev/x86_64/src/poly_use_hint_88_avx2.S new file mode 100644 index 000000000..08bec8d09 --- /dev/null +++ b/dev/x86_64/src/poly_use_hint_88_avx2.S @@ -0,0 +1,142 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + + +/************************************************* + * Name: mld_poly_use_hint + * + * Description: Use hint polynomial to correct the high bits of a polynomial. + * + * Arguments: - mld_poly *b: pointer to output polynomial with corrected high + * bits + * - const mld_poly *a: pointer to input polynomial + * - const mld_poly *hint: pointer to input hint polynomial + **************************************************/ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + MLD_CONFIG_PARAMETER_SET == 44) +/* simpasm: header-end */ + +/* Reference: + * - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop. + * - Our implementation of decompose() is slightly different from that in + * @[REF_AVX2]. See poly_decompose_88_avx2 for more information. + */ + +// a aliased with a0 +.macro decompose88_avx2 a1, a, temp1, temp2, temp3 +/* a1, a0 = decompose(a) + * See poly_decompose_88_avx2 for more information. */ +vpaddd \a, %ymm4, \temp1 +vpsrld $7, \temp1, \temp1 +vpmulhuw %ymm8, \temp1, \temp1 +vpmulhrsw %ymm7, \temp1, \temp1 +/* Check for wrap-around; set a1 = 0 if required */ +vpcmpgtd %ymm3, \a, \temp2 +vpandn \temp1, \temp2, \a1 +/* Compute remainder a0 */ +vpslld $1, \temp1, \temp3 +vpaddd \temp1, \temp3, \temp3 +vpslld $5, \temp3, \temp1 +vpsubd \temp3, \temp1, \temp1 +vpslld $11, \temp1, \temp1 +vpsubd \temp1, \a, \a +/* If wrap-around is required, a0 -= 1 */ +vpaddd \temp2, \a, \a +.endm + +/* Reference: The reference avx2 implementation checks a0 >= 0, which is + * different from the specification and the reference C implementation. We + * follow the specification and check a0 > 0. + */ + +// a aliased with delta +.macro use_hint88_avx2 b, a, h, a1, temp1, temp2, temp3 +decompose88_avx2 \a1, \a, \temp1, \temp2, \temp3 + +/* h = (a0 > 0) ? h : -h */ +vpcmpgtd %ymm5, \a, \a +vpandn \h, \a, \a +vpslld $1, \a, \a +vpsubd \a, \h, \a + +/* b = (a1 + h) % 44 */ +vpaddd \a1, \a, \b +vblendvps \b, %ymm6, \b, \b +vpcmpgtd %ymm6, \b, \h +vpandn \b, \h, \b +.endm + +.text +.balign 16 +.global MLD_ASM_NAMESPACE(mld_poly_use_hint_88_avx2) +MLD_ASM_FN_SYMBOL(mld_poly_use_hint_88_avx2) + +// Initialize constants +movl $127, %ecx +xorl %eax, %eax +vpxor %xmm5, %xmm5, %xmm5 + +/* check-magic: 11275 == floor(2^24 / 1488) */ +movl $11275, %r8d +vmovd %r8d, %xmm8 +vpbroadcastd %xmm8, %ymm8 + +vmovd %ecx, %xmm4 + +/* 87 * ((Q-1) / 88), wrap-around threshold */ +movl $8285184, %ecx + +/* round(x * 2^7 / 2^15) => round(x / 2^8), for f1 = round(f1''/ 2^8)*/ +movl $128, %r9d +vmovd %r9d, %xmm7 +vpbroadcastd %xmm7, %ymm7 + + /* max a1 value */ +movl $43, %r10d +vmovd %r10d, %xmm6 +vpbroadcastd %xmm6, %ymm6 + +vmovd %ecx, %xmm3 +vpbroadcastd %xmm4, %ymm4 +vpbroadcastd %xmm3, %ymm3 + +mld_poly_use_hint_88_avx2_loop: +vmovdqa (%rsi,%rax), %ymm0 +vmovdqa (%rdx,%rax), %ymm1 + +use_hint88_avx2 %ymm0, %ymm0, %ymm1, %ymm9, %ymm10, %ymm11, %ymm12 + +vmovdqa %ymm0, (%rdi,%rax) +addq $32, %rax +cmpq $1024, %rax +jne mld_poly_use_hint_88_avx2_loop + +ret + +/* simpasm: footer-start */ + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ + 44) */ + diff --git a/dev/x86_64/src/poly_use_hint_88_avx2.c b/dev/x86_64/src/poly_use_hint_88_avx2.c deleted file mode 100644 index 32fa2e6c6..000000000 --- a/dev/x86_64/src/poly_use_hint_88_avx2.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [REF_AVX2] - * CRYSTALS-Dilithium optimized AVX2 implementation - * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé - * https://github.com/pq-crystals/dilithium/tree/master/avx2 - */ - -/* - * This file is derived from the public domain - * AVX2 Dilithium implementation @[REF_AVX2]. - */ - -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ - MLD_CONFIG_PARAMETER_SET == 44) - -#include -#include "arith_native_x86_64.h" -#include "consts.h" - -#define MLD_MM256_BLENDV_EPI32(a, b, mask) \ - _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ - _mm256_castsi256_ps(b), \ - _mm256_castsi256_ps(mask))) - -void mld_poly_use_hint_88_avx2(int32_t *b, const int32_t *a, - const int32_t *hint) -{ - unsigned int i; - __m256i f, f0, f1, h, t; - const __m256i q_bound = _mm256_set1_epi32(87 * ((MLDSA_Q - 1) / 88)); - /* check-magic: 11275 == floor(2**24 / 1488) */ - const __m256i v = _mm256_set1_epi32(11275); - const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 88)); - const __m256i off = _mm256_set1_epi32(127); - const __m256i shift = _mm256_set1_epi32(128); - const __m256i max = _mm256_set1_epi32(43); - const __m256i zero = _mm256_setzero_si256(); - - for (i = 0; i < MLDSA_N / 8; i++) - { - f = _mm256_load_si256((const __m256i *)&a[8 * i]); - h = _mm256_load_si256((const __m256i *)&hint[8 * i]); - - /* Reference: - * - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop. - * - Our implementation of decompose() is slightly different from that in - * @[REF_AVX2]. See poly_decompose_88_avx2.c for more information. - */ - /* f1, f2 = decompose(f) */ - f1 = _mm256_add_epi32(f, off); - f1 = _mm256_srli_epi32(f1, 7); - f1 = _mm256_mulhi_epu16(f1, v); - f1 = _mm256_mulhrs_epi16(f1, shift); - t = _mm256_cmpgt_epi32(f, q_bound); - f0 = _mm256_mullo_epi32(f1, alpha); - f0 = _mm256_sub_epi32(f, f0); - f1 = _mm256_andnot_si256(t, f1); - f0 = _mm256_add_epi32(f0, t); - - /* Reference: The reference avx2 implementation checks a0 >= 0, which is - * different from the specification and the reference C implementation. We - * follow the specification and check a0 > 0. - */ - /* t = (f0 > 0) ? h : -h */ - f0 = _mm256_cmpgt_epi32(f0, zero); - t = MLD_MM256_BLENDV_EPI32(h, zero, f0); - t = _mm256_slli_epi32(t, 1); - h = _mm256_sub_epi32(h, t); - - /* f1 = (f1 + t) % 44 */ - f1 = _mm256_add_epi32(f1, h); - f1 = MLD_MM256_BLENDV_EPI32(f1, max, f1); - f = _mm256_cmpgt_epi32(f1, max); - f1 = MLD_MM256_BLENDV_EPI32(f1, zero, f); - - _mm256_store_si256((__m256i *)&b[8 * i], f1); - } -} - -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ - && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ - 44) */ - -MLD_EMPTY_CU(avx2_poly_use_hint_88) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ - 44)) */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef MLD_MM256_BLENDV_EPI32 diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c index 9ad43f785..735ff937a 100644 --- a/mldsa/mldsa_native.c +++ b/mldsa/mldsa_native.c @@ -85,8 +85,6 @@ #include "src/native/x86_64/src/poly_chknorm_avx2.c" #include "src/native/x86_64/src/poly_decompose_32_avx2.c" #include "src/native/x86_64/src/poly_decompose_88_avx2.c" -#include "src/native/x86_64/src/poly_use_hint_32_avx2.c" -#include "src/native/x86_64/src/poly_use_hint_88_avx2.c" #include "src/native/x86_64/src/polyz_unpack_17_avx2.c" #include "src/native/x86_64/src/polyz_unpack_19_avx2.c" #include "src/native/x86_64/src/rej_uniform_avx2.c" diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S index f0c54203f..e427ce701 100644 --- a/mldsa/mldsa_native_asm.S +++ b/mldsa/mldsa_native_asm.S @@ -87,6 +87,8 @@ #include "src/native/x86_64/src/pointwise_acc_l5.S" #include "src/native/x86_64/src/pointwise_acc_l7.S" #include "src/native/x86_64/src/poly_caddq_avx2.S" +#include "src/native/x86_64/src/poly_use_hint_32_avx2.S" +#include "src/native/x86_64/src/poly_use_hint_88_avx2.S" #endif /* MLD_SYS_X86_64 */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */ diff --git a/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.S b/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.S new file mode 100644 index 000000000..626c13d6b --- /dev/null +++ b/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.S @@ -0,0 +1,108 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + + +/************************************************* + * Name: mld_poly_use_hint + * + * Description: Use hint polynomial to correct the high bits of a polynomial. + * + * Arguments: - mld_poly *b: pointer to output polynomial with corrected high + * bits + * - const mld_poly *a: pointer to input polynomial + * - const mld_poly *hint: pointer to input hint polynomial + **************************************************/ + + + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) + + + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/poly_use_hint_32_avx2.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(mld_poly_use_hint_32_avx2) +MLD_ASM_FN_SYMBOL(mld_poly_use_hint_32_avx2) + + .cfi_startproc + movl $0x7f, %ecx + movl $0x401, %r8d # imm = 0x401 + vmovd %r8d, %xmm8 + vpbroadcastd %xmm8, %ymm8 + xorl %eax, %eax + vpxor %xmm6, %xmm6, %xmm6 + vmovd %ecx, %xmm5 + movl $0x15ba900, %ecx # imm = 0x15BA900 + movl $0x200, %r9d # imm = 0x200 + vmovd %r9d, %xmm7 + vpbroadcastd %xmm7, %ymm7 + vmovd %ecx, %xmm4 + movl $0xf, %ecx + vpbroadcastd %xmm5, %ymm5 + vmovd %ecx, %xmm3 + vpbroadcastd %xmm4, %ymm4 + vpbroadcastd %xmm3, %ymm3 + +Lmld_poly_use_hint_32_avx2_loop: + vmovdqa (%rsi,%rax), %ymm0 + vmovdqa (%rdx,%rax), %ymm2 + vpaddd %ymm0, %ymm5, %ymm1 + vpsrld $0x7, %ymm1, %ymm1 + vpmulhuw %ymm8, %ymm1, %ymm1 + vpmulhrsw %ymm7, %ymm1, %ymm1 + vpcmpgtd %ymm4, %ymm0, %ymm11 + vpandn %ymm1, %ymm11, %ymm9 + vpslld $0xa, %ymm1, %ymm10 + vpsubd %ymm1, %ymm10, %ymm1 + vpslld $0x9, %ymm1, %ymm1 + vpsubd %ymm1, %ymm0, %ymm0 + vpaddd %ymm11, %ymm0, %ymm0 + vpcmpgtd %ymm6, %ymm0, %ymm0 + vpandn %ymm2, %ymm0, %ymm0 + vpslld $0x1, %ymm0, %ymm0 + vpsubd %ymm0, %ymm2, %ymm2 + vpaddd %ymm9, %ymm2, %ymm2 + vpand %ymm3, %ymm2, %ymm2 + vmovdqa %ymm2, (%rdi,%rax) + addq $0x20, %rax + cmpq $0x400, %rax # imm = 0x400 + jne Lmld_poly_use_hint_32_avx2_loop + retq + .cfi_endproc + +MLD_ASM_FN_SIZE(mld_poly_use_hint_32_avx2) + + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ + 65 || MLD_CONFIG_PARAMETER_SET == 87) */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c b/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c deleted file mode 100644 index 9a5c866bb..000000000 --- a/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [REF_AVX2] - * CRYSTALS-Dilithium optimized AVX2 implementation - * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé - * https://github.com/pq-crystals/dilithium/tree/master/avx2 - */ - -/* - * This file is derived from the public domain - * AVX2 Dilithium implementation @[REF_AVX2]. - */ - -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ - (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) - -#include -#include "arith_native_x86_64.h" -#include "consts.h" - -#define MLD_MM256_BLENDV_EPI32(a, b, mask) \ - _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ - _mm256_castsi256_ps(b), \ - _mm256_castsi256_ps(mask))) - -void mld_poly_use_hint_32_avx2(int32_t *b, const int32_t *a, - const int32_t *hint) -{ - unsigned int i; - __m256i f, f0, f1, h, t; - const __m256i q_bound = _mm256_set1_epi32(87 * ((MLDSA_Q - 1) / 32)); - /* check-magic: 1025 == floor(2**22 / 4092) */ - const __m256i v = _mm256_set1_epi32(1025); - const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 32)); - const __m256i off = _mm256_set1_epi32(127); - const __m256i shift = _mm256_set1_epi32(512); - const __m256i mask = _mm256_set1_epi32(15); - const __m256i zero = _mm256_setzero_si256(); - - for (i = 0; i < MLDSA_N / 8; i++) - { - f = _mm256_load_si256((const __m256i *)&a[8 * i]); - h = _mm256_load_si256((const __m256i *)&hint[8 * i]); - - /* Reference: - * - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop. - * - Our implementation of decompose() is slightly different from that in - * @[REF_AVX2]. See poly_decompose_32_avx2.c for more information. - */ - /* f1, f2 = decompose(f) */ - f1 = _mm256_add_epi32(f, off); - f1 = _mm256_srli_epi32(f1, 7); - f1 = _mm256_mulhi_epu16(f1, v); - f1 = _mm256_mulhrs_epi16(f1, shift); - t = _mm256_cmpgt_epi32(f, q_bound); - f0 = _mm256_mullo_epi32(f1, alpha); - f0 = _mm256_sub_epi32(f, f0); - f1 = _mm256_andnot_si256(t, f1); - f0 = _mm256_add_epi32(f0, t); - - /* Reference: The reference avx2 implementation checks a0 >= 0, which is - * different from the specification and the reference C implementation. We - * follow the specification and check a0 > 0. - */ - /* t = (f0 > 0) ? h : -h */ - f0 = _mm256_cmpgt_epi32(f0, zero); - t = MLD_MM256_BLENDV_EPI32(h, zero, f0); - t = _mm256_slli_epi32(t, 1); - h = _mm256_sub_epi32(h, t); - - /* f1 = (f1 + t) % 16 */ - f1 = _mm256_add_epi32(f1, h); - f1 = _mm256_and_si256(f1, mask); - - _mm256_store_si256((__m256i *)&b[8 * i], f1); - } -} - -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ - && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ - 65 || MLD_CONFIG_PARAMETER_SET == 87) */ - -MLD_EMPTY_CU(avx2_poly_use_hint_32) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ - || MLD_CONFIG_PARAMETER_SET == 87)) */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef MLD_MM256_BLENDV_EPI32 diff --git a/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.S b/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.S new file mode 100644 index 000000000..407233d7b --- /dev/null +++ b/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.S @@ -0,0 +1,109 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + + +/************************************************* + * Name: mld_poly_use_hint + * + * Description: Use hint polynomial to correct the high bits of a polynomial. + * + * Arguments: - mld_poly *b: pointer to output polynomial with corrected high + * bits + * - const mld_poly *a: pointer to input polynomial + * - const mld_poly *hint: pointer to input hint polynomial + **************************************************/ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + MLD_CONFIG_PARAMETER_SET == 44) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/poly_use_hint_88_avx2.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(mld_poly_use_hint_88_avx2) +MLD_ASM_FN_SYMBOL(mld_poly_use_hint_88_avx2) + + .cfi_startproc + movl $0x7f, %ecx + xorl %eax, %eax + vpxor %xmm5, %xmm5, %xmm5 + movl $0x2c0b, %r8d # imm = 0x2C0B + vmovd %r8d, %xmm8 + vpbroadcastd %xmm8, %ymm8 + vmovd %ecx, %xmm4 + movl $0x7e6c00, %ecx # imm = 0x7E6C00 + movl $0x80, %r9d + vmovd %r9d, %xmm7 + vpbroadcastd %xmm7, %ymm7 + movl $0x2b, %r10d + vmovd %r10d, %xmm6 + vpbroadcastd %xmm6, %ymm6 + vmovd %ecx, %xmm3 + vpbroadcastd %xmm4, %ymm4 + vpbroadcastd %xmm3, %ymm3 + +Lmld_poly_use_hint_88_avx2_loop: + vmovdqa (%rsi,%rax), %ymm0 + vmovdqa (%rdx,%rax), %ymm1 + vpaddd %ymm0, %ymm4, %ymm10 + vpsrld $0x7, %ymm10, %ymm10 + vpmulhuw %ymm8, %ymm10, %ymm10 + vpmulhrsw %ymm7, %ymm10, %ymm10 + vpcmpgtd %ymm3, %ymm0, %ymm11 + vpandn %ymm10, %ymm11, %ymm9 + vpslld $0x1, %ymm10, %ymm12 + vpaddd %ymm10, %ymm12, %ymm12 + vpslld $0x5, %ymm12, %ymm10 + vpsubd %ymm12, %ymm10, %ymm10 + vpslld $0xb, %ymm10, %ymm10 + vpsubd %ymm10, %ymm0, %ymm0 + vpaddd %ymm11, %ymm0, %ymm0 + vpcmpgtd %ymm5, %ymm0, %ymm0 + vpandn %ymm1, %ymm0, %ymm0 + vpslld $0x1, %ymm0, %ymm0 + vpsubd %ymm0, %ymm1, %ymm0 + vpaddd %ymm9, %ymm0, %ymm0 + vblendvps %ymm0, %ymm6, %ymm0, %ymm0 + vpcmpgtd %ymm6, %ymm0, %ymm1 + vpandn %ymm0, %ymm1, %ymm0 + vmovdqa %ymm0, (%rdi,%rax) + addq $0x20, %rax + cmpq $0x400, %rax # imm = 0x400 + jne Lmld_poly_use_hint_88_avx2_loop + retq + .cfi_endproc + +MLD_ASM_FN_SIZE(mld_poly_use_hint_88_avx2) + + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ + 44) */ + + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c b/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c deleted file mode 100644 index 32fa2e6c6..000000000 --- a/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [REF_AVX2] - * CRYSTALS-Dilithium optimized AVX2 implementation - * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé - * https://github.com/pq-crystals/dilithium/tree/master/avx2 - */ - -/* - * This file is derived from the public domain - * AVX2 Dilithium implementation @[REF_AVX2]. - */ - -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ - MLD_CONFIG_PARAMETER_SET == 44) - -#include -#include "arith_native_x86_64.h" -#include "consts.h" - -#define MLD_MM256_BLENDV_EPI32(a, b, mask) \ - _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ - _mm256_castsi256_ps(b), \ - _mm256_castsi256_ps(mask))) - -void mld_poly_use_hint_88_avx2(int32_t *b, const int32_t *a, - const int32_t *hint) -{ - unsigned int i; - __m256i f, f0, f1, h, t; - const __m256i q_bound = _mm256_set1_epi32(87 * ((MLDSA_Q - 1) / 88)); - /* check-magic: 11275 == floor(2**24 / 1488) */ - const __m256i v = _mm256_set1_epi32(11275); - const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 88)); - const __m256i off = _mm256_set1_epi32(127); - const __m256i shift = _mm256_set1_epi32(128); - const __m256i max = _mm256_set1_epi32(43); - const __m256i zero = _mm256_setzero_si256(); - - for (i = 0; i < MLDSA_N / 8; i++) - { - f = _mm256_load_si256((const __m256i *)&a[8 * i]); - h = _mm256_load_si256((const __m256i *)&hint[8 * i]); - - /* Reference: - * - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop. - * - Our implementation of decompose() is slightly different from that in - * @[REF_AVX2]. See poly_decompose_88_avx2.c for more information. - */ - /* f1, f2 = decompose(f) */ - f1 = _mm256_add_epi32(f, off); - f1 = _mm256_srli_epi32(f1, 7); - f1 = _mm256_mulhi_epu16(f1, v); - f1 = _mm256_mulhrs_epi16(f1, shift); - t = _mm256_cmpgt_epi32(f, q_bound); - f0 = _mm256_mullo_epi32(f1, alpha); - f0 = _mm256_sub_epi32(f, f0); - f1 = _mm256_andnot_si256(t, f1); - f0 = _mm256_add_epi32(f0, t); - - /* Reference: The reference avx2 implementation checks a0 >= 0, which is - * different from the specification and the reference C implementation. We - * follow the specification and check a0 > 0. - */ - /* t = (f0 > 0) ? h : -h */ - f0 = _mm256_cmpgt_epi32(f0, zero); - t = MLD_MM256_BLENDV_EPI32(h, zero, f0); - t = _mm256_slli_epi32(t, 1); - h = _mm256_sub_epi32(h, t); - - /* f1 = (f1 + t) % 44 */ - f1 = _mm256_add_epi32(f1, h); - f1 = MLD_MM256_BLENDV_EPI32(f1, max, f1); - f = _mm256_cmpgt_epi32(f1, max); - f1 = MLD_MM256_BLENDV_EPI32(f1, zero, f); - - _mm256_store_si256((__m256i *)&b[8 * i], f1); - } -} - -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ - && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ - 44) */ - -MLD_EMPTY_CU(avx2_poly_use_hint_88) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ - 44)) */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef MLD_MM256_BLENDV_EPI32 diff --git a/test/bench/bench_components_mldsa.c b/test/bench/bench_components_mldsa.c index 6cf8627e5..b2c25e2da 100644 --- a/test/bench/bench_components_mldsa.c +++ b/test/bench/bench_components_mldsa.c @@ -61,6 +61,7 @@ static int cmp_uint64_t(const void *a, const void *b) static int bench(void) { MLD_ALIGN int32_t data0[256]; + MLD_ALIGN int32_t data1[256]; MLD_ALIGN mld_poly poly_out; MLD_ALIGN mld_polyvecl polyvecl_a, polyvecl_b; MLD_ALIGN mld_polyveck polyveck_out; @@ -86,6 +87,9 @@ static int bench(void) BENCH("poly_caddq", mld_poly_caddq((mld_poly *)data0)); + BENCH("poly_use_hint", + mld_poly_use_hint(&poly_out, (mld_poly *)data0, (mld_poly *)data1)); + return 0; }