diff --git a/.github/workflows/os-check.yml b/.github/workflows/os-check.yml index a6b9b945940..f5bf407efad 100644 --- a/.github/workflows/os-check.yml +++ b/.github/workflows/os-check.yml @@ -542,6 +542,12 @@ jobs: fail-fast: false matrix: arch: [ x64, Win32, ARM64 ] + asm: [ false ] + include: + # Intel assembly build (x64 only): assembles the crypto .asm files + # and enables the matching USE_INTEL_SPEEDUP code paths. + - arch: x64 + asm: true # This should be a safe limit for the tests to run. timeout-minutes: 6 env: @@ -566,7 +572,7 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} # Add additional options to the MSBuild command line here (like platform or verbosity level). # See https://docs.microsoft.com/visualstudio/msbuild/msbuild-command-line-reference - run: msbuild /m /p:PlatformToolset=v142 /p:Platform=${{matrix.arch}} /p:Configuration=${{env.BUILD_CONFIGURATION}} ${{env.SOLUTION_FILE_PATH}} + run: msbuild /m /p:PlatformToolset=v142 /p:Platform=${{matrix.arch}} /p:Configuration=${{env.BUILD_CONFIGURATION}} /p:WolfSSLIntelAsm=${{matrix.asm}} ${{env.SOLUTION_FILE_PATH}} - if: ${{ matrix.arch != 'ARM64' }} name: Run Test diff --git a/.github/workflows/win-csharp-test.yml b/.github/workflows/win-csharp-test.yml index d37637e566e..001ac0fd96a 100644 --- a/.github/workflows/win-csharp-test.yml +++ b/.github/workflows/win-csharp-test.yml @@ -13,6 +13,13 @@ jobs: if: ${{ (github.repository_owner == 'wolfssl') && (github.event_name != 'pull_request' || github.event.pull_request.draft == false) }} runs-on: windows-latest + strategy: + fail-fast: false + matrix: + # false: pure C. true: assemble the crypto .asm files and enable the + # USE_INTEL_SPEEDUP code paths (x64). + asm: [ false, true ] + # This should be a safe limit for the tests to run. timeout-minutes: 6 @@ -48,7 +55,7 @@ jobs: working-directory: ${{env.GITHUB_WORKSPACE}} # Add additional options to the MSBuild command line here (like platform or verbosity level). 
# See https://docs.microsoft.com/visualstudio/msbuild/msbuild-command-line-reference - run: msbuild /m /p:PlatformToolset=v142 /p:Platform=${{env.BUILD_PLATFORM}} /p:Configuration=${{env.BUILD_CONFIGURATION}} ${{env.SOLUTION_FILE_PATH}} + run: msbuild /m /p:PlatformToolset=v142 /p:Platform=${{env.BUILD_PLATFORM}} /p:Configuration=${{env.BUILD_CONFIGURATION}} /p:WolfSSLIntelAsm=${{matrix.asm}} ${{env.SOLUTION_FILE_PATH}} - name: Run wolfCrypt test working-directory: ${{env.GITHUB_WORKSPACE}}wolfssl\wrapper\CSharp\Debug\x64\ diff --git a/examples/client/client.vcxproj b/examples/client/client.vcxproj index 0843627d584..d6a21467c1c 100644 --- a/examples/client/client.vcxproj +++ b/examples/client/client.vcxproj @@ -478,6 +478,14 @@ + + + + USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions) + + diff --git a/examples/echoclient/echoclient.vcxproj b/examples/echoclient/echoclient.vcxproj index 68eb81b1d5c..233b1cdbd28 100644 --- a/examples/echoclient/echoclient.vcxproj +++ b/examples/echoclient/echoclient.vcxproj @@ -478,6 +478,14 @@ + + + + USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions) + + diff --git a/examples/echoserver/echoserver.vcxproj b/examples/echoserver/echoserver.vcxproj index 68c4f16800a..29f440f56ce 100644 --- a/examples/echoserver/echoserver.vcxproj +++ b/examples/echoserver/echoserver.vcxproj @@ -478,6 +478,14 @@ + + + + USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions) + + diff --git a/examples/server/server.vcxproj b/examples/server/server.vcxproj index 3695fc1eb6b..9343976a6d7 100644 --- a/examples/server/server.vcxproj +++ b/examples/server/server.vcxproj @@ -478,6 +478,14 @@ + + + + USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions) + + diff --git a/sslSniffer/sslSniffer.vcxproj b/sslSniffer/sslSniffer.vcxproj index 88bbc963fe4..4925b99b832 100644 --- a/sslSniffer/sslSniffer.vcxproj +++ b/sslSniffer/sslSniffer.vcxproj @@ -256,6 +256,14 @@ false + + + + 
USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions) + + diff --git a/sslSniffer/sslSnifferTest/sslSniffTest.vcxproj b/sslSniffer/sslSnifferTest/sslSniffTest.vcxproj index 8d4cb32aca1..f98f33cc1fd 100644 --- a/sslSniffer/sslSnifferTest/sslSniffTest.vcxproj +++ b/sslSniffer/sslSnifferTest/sslSniffTest.vcxproj @@ -1,263 +1,271 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Debug - ARM64 - - - Release - Win32 - - - Release - x64 - - - Release - ARM64 - - - - {8C89E16E-9C36-45EF-A491-F4EBD4A8D8F1} - sslSniffTest - Win32Proj - 10.0 - - - - Application - v141 - Unicode - true - - - Application - v141 - Unicode - true - - - Application - v141 - Unicode - true - - - Application - v141 - Unicode - - - Application - v141 - Unicode - - - Application - v141 - Unicode - - - - - - - - - - - - - - - - - - - - - - - - - <_ProjectFileVersion>15.0.28307.799 - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - true - snifftest - - - true - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - $(SolutionDir)$(Configuration)\$(Platform)\ - snifftest - - - true - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - $(SolutionDir)$(Configuration)\$(Platform)\ - snifftest - - - $(SolutionDir)$(Configuration)\$(Platform)\ - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - false - snifftest - - - false - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - $(SolutionDir)$(Configuration)\$(Platform)\ - snifftest - - - false - $(Configuration)\$(Platform)\$(ProjectName)_obj\ - $(SolutionDir)$(Configuration)\$(Platform)\ - snifftest - - - - Disabled - ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories) - WIN32;WOLFSSL_USER_SETTINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - true - EnableFastChecks - MultiThreadedDebugDLL - - Level3 - EditAndContinue - - - wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies) - 
../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories) - true - Console - MachineX86 - - - - - Disabled - ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories) - WIN32;WOLFSSL_USER_SETTINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - EnableFastChecks - MultiThreadedDebugDLL - - - Level3 - ProgramDatabase - - - wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies) - ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories) - true - Console - - - - - Disabled - ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories) - WIN32;WOLFSSL_USER_SETTINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - EnableFastChecks - MultiThreadedDebugDLL - - - Level3 - ProgramDatabase - - - wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies) - ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories) - true - Console - - - - - MaxSpeed - true - ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories) - WIN32;WOLFSSL_USER_SETTINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - Level3 - ProgramDatabase - - - wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies) - ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories) - true - Console - true - true - MachineX86 - - - - - MaxSpeed - true - ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories) - WIN32;WOLFSSL_USER_SETTINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - - Level3 - ProgramDatabase - - - wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies) - ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories) - true - Console - true - true - - - - - MaxSpeed - true - 
../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories) - WIN32;WOLFSSL_USER_SETTINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - MultiThreadedDLL - true - - - Level3 - ProgramDatabase - - - wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies) - ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories) - true - Console - true - true - - - - - - - - - - - + + + + + Debug + Win32 + + + Debug + x64 + + + Debug + ARM64 + + + Release + Win32 + + + Release + x64 + + + Release + ARM64 + + + + {8C89E16E-9C36-45EF-A491-F4EBD4A8D8F1} + sslSniffTest + Win32Proj + 10.0 + + + + Application + v141 + Unicode + true + + + Application + v141 + Unicode + true + + + Application + v141 + Unicode + true + + + Application + v141 + Unicode + + + Application + v141 + Unicode + + + Application + v141 + Unicode + + + + + + + + + + + + + + + + + + + + + + + + + <_ProjectFileVersion>15.0.28307.799 + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + true + snifftest + + + true + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + $(SolutionDir)$(Configuration)\$(Platform)\ + snifftest + + + true + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + $(SolutionDir)$(Configuration)\$(Platform)\ + snifftest + + + $(SolutionDir)$(Configuration)\$(Platform)\ + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + false + snifftest + + + false + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + $(SolutionDir)$(Configuration)\$(Platform)\ + snifftest + + + false + $(Configuration)\$(Platform)\$(ProjectName)_obj\ + $(SolutionDir)$(Configuration)\$(Platform)\ + snifftest + + + + Disabled + ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories) + WIN32;WOLFSSL_USER_SETTINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + EnableFastChecks + MultiThreadedDebugDLL + + Level3 + EditAndContinue + + + 
wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies) + ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories) + true + Console + MachineX86 + + + + + Disabled + ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories) + WIN32;WOLFSSL_USER_SETTINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level3 + ProgramDatabase + + + wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies) + ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories) + true + Console + + + + + Disabled + ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories) + WIN32;WOLFSSL_USER_SETTINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + EnableFastChecks + MultiThreadedDebugDLL + + + Level3 + ProgramDatabase + + + wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies) + ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories) + true + Console + + + + + MaxSpeed + true + ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories) + WIN32;WOLFSSL_USER_SETTINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + Level3 + ProgramDatabase + + + wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies) + ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories) + true + Console + true + true + MachineX86 + + + + + MaxSpeed + true + ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories) + WIN32;WOLFSSL_USER_SETTINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + + Level3 + ProgramDatabase + + + wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies) + ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories) + true + Console + true + 
true + + + + + MaxSpeed + true + ../../../WpdPack/Include;../..;../../IDE/WIN;%(AdditionalIncludeDirectories) + WIN32;WOLFSSL_USER_SETTINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + MultiThreadedDLL + true + + + Level3 + ProgramDatabase + + + wpcap.lib;Packet.lib;sslSniffer.lib;ws2_32.lib;%(AdditionalDependencies) + ../../../WpdPack/Lib/x64;$(SolutionDir)$(Configuration)\$(Platform)\;%(AdditionalLibraryDirectories) + true + Console + true + true + + + + + + + + + + + + USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions) + + + + + \ No newline at end of file diff --git a/testsuite/testsuite.vcxproj b/testsuite/testsuite.vcxproj index baa2760f72c..8bc4242f0aa 100644 --- a/testsuite/testsuite.vcxproj +++ b/testsuite/testsuite.vcxproj @@ -484,6 +484,14 @@ + + + + USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions) + + diff --git a/wolfcrypt/src/aes_gcm_asm.asm b/wolfcrypt/src/aes_gcm_asm.asm index 34f68476310..d7947b63a0d 100644 --- a/wolfcrypt/src/aes_gcm_asm.asm +++ b/wolfcrypt/src/aes_gcm_asm.asm @@ -43,14 +43,12 @@ ENDIF _DATA SEGMENT ALIGN 16 -L_GCM_generate_m0_aesni_rev8 QWORD \ - 08090a0b0c0d0e0fh, 0001020304050607h +L_GCM_generate_m0_aesni_rev8 QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_GCM_generate_m0_aesni_rev8 QWORD L_GCM_generate_m0_aesni_rev8 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_GCM_generate_m0_aesni_mod2_128 QWORD \ - 0000000000000000h, 0e100000000000000h +L_GCM_generate_m0_aesni_mod2_128 QWORD 0000000000000000h, 0e100000000000000h ptr_L_GCM_generate_m0_aesni_mod2_128 QWORD L_GCM_generate_m0_aesni_mod2_128 _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -298,68 +296,57 @@ GCM_generate_m0_aesni ENDP _TEXT ENDS _DATA SEGMENT ALIGN 16 -L_aes_gcm_one QWORD \ - 0000000000000000h, 0000000000000001h +L_aes_gcm_one QWORD 0000000000000000h, 0000000000000001h ptr_L_aes_gcm_one QWORD L_aes_gcm_one _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_gcm_two QWORD \ - 0000000000000000h, 0000000000000002h +L_aes_gcm_two QWORD 0000000000000000h, 
0000000000000002h ptr_L_aes_gcm_two QWORD L_aes_gcm_two _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_gcm_three QWORD \ - 0000000000000000h, 0000000000000003h +L_aes_gcm_three QWORD 0000000000000000h, 0000000000000003h ptr_L_aes_gcm_three QWORD L_aes_gcm_three _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_gcm_four QWORD \ - 0000000000000000h, 0000000000000004h +L_aes_gcm_four QWORD 0000000000000000h, 0000000000000004h ptr_L_aes_gcm_four QWORD L_aes_gcm_four _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_gcm_five QWORD \ - 0000000000000000h, 0000000000000005h +L_aes_gcm_five QWORD 0000000000000000h, 0000000000000005h ptr_L_aes_gcm_five QWORD L_aes_gcm_five _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_gcm_six QWORD \ - 0000000000000000h, 0000000000000006h +L_aes_gcm_six QWORD 0000000000000000h, 0000000000000006h ptr_L_aes_gcm_six QWORD L_aes_gcm_six _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_gcm_seven QWORD \ - 0000000000000000h, 0000000000000007h +L_aes_gcm_seven QWORD 0000000000000000h, 0000000000000007h ptr_L_aes_gcm_seven QWORD L_aes_gcm_seven _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_gcm_eight QWORD \ - 0000000000000000h, 0000000000000008h +L_aes_gcm_eight QWORD 0000000000000000h, 0000000000000008h ptr_L_aes_gcm_eight QWORD L_aes_gcm_eight _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_gcm_bswap_epi64 QWORD \ - 0001020304050607h, 08090a0b0c0d0e0fh +L_aes_gcm_bswap_epi64 QWORD 0001020304050607h, 08090a0b0c0d0e0fh ptr_L_aes_gcm_bswap_epi64 QWORD L_aes_gcm_bswap_epi64 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_gcm_bswap_mask QWORD \ - 08090a0b0c0d0e0fh, 0001020304050607h +L_aes_gcm_bswap_mask QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_aes_gcm_bswap_mask QWORD L_aes_gcm_bswap_mask _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_gcm_mod2_128 QWORD \ - 0000000000000001h, 0c200000000000000h +L_aes_gcm_mod2_128 QWORD 0000000000000001h, 0c200000000000000h ptr_L_aes_gcm_mod2_128 QWORD L_aes_gcm_mod2_128 _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -6472,14 +6459,12 @@ _TEXT ENDS IFDEF HAVE_INTEL_AVX1 _DATA 
SEGMENT ALIGN 16 -L_GCM_generate_m0_avx1_rev8 QWORD \ - 08090a0b0c0d0e0fh, 0001020304050607h +L_GCM_generate_m0_avx1_rev8 QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_GCM_generate_m0_avx1_rev8 QWORD L_GCM_generate_m0_avx1_rev8 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_GCM_generate_m0_avx1_mod2_128 QWORD \ - 0000000000000000h, 0e100000000000000h +L_GCM_generate_m0_avx1_mod2_128 QWORD 0000000000000000h, 0e100000000000000h ptr_L_GCM_generate_m0_avx1_mod2_128 QWORD L_GCM_generate_m0_avx1_mod2_128 _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -6693,68 +6678,57 @@ GCM_generate_m0_avx1 ENDP _TEXT ENDS _DATA SEGMENT ALIGN 16 -L_avx1_aes_gcm_one QWORD \ - 0000000000000000h, 0000000000000001h +L_avx1_aes_gcm_one QWORD 0000000000000000h, 0000000000000001h ptr_L_avx1_aes_gcm_one QWORD L_avx1_aes_gcm_one _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx1_aes_gcm_two QWORD \ - 0000000000000000h, 0000000000000002h +L_avx1_aes_gcm_two QWORD 0000000000000000h, 0000000000000002h ptr_L_avx1_aes_gcm_two QWORD L_avx1_aes_gcm_two _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx1_aes_gcm_three QWORD \ - 0000000000000000h, 0000000000000003h +L_avx1_aes_gcm_three QWORD 0000000000000000h, 0000000000000003h ptr_L_avx1_aes_gcm_three QWORD L_avx1_aes_gcm_three _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx1_aes_gcm_four QWORD \ - 0000000000000000h, 0000000000000004h +L_avx1_aes_gcm_four QWORD 0000000000000000h, 0000000000000004h ptr_L_avx1_aes_gcm_four QWORD L_avx1_aes_gcm_four _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx1_aes_gcm_five QWORD \ - 0000000000000000h, 0000000000000005h +L_avx1_aes_gcm_five QWORD 0000000000000000h, 0000000000000005h ptr_L_avx1_aes_gcm_five QWORD L_avx1_aes_gcm_five _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx1_aes_gcm_six QWORD \ - 0000000000000000h, 0000000000000006h +L_avx1_aes_gcm_six QWORD 0000000000000000h, 0000000000000006h ptr_L_avx1_aes_gcm_six QWORD L_avx1_aes_gcm_six _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx1_aes_gcm_seven QWORD \ - 0000000000000000h, 0000000000000007h +L_avx1_aes_gcm_seven 
QWORD 0000000000000000h, 0000000000000007h ptr_L_avx1_aes_gcm_seven QWORD L_avx1_aes_gcm_seven _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx1_aes_gcm_eight QWORD \ - 0000000000000000h, 0000000000000008h +L_avx1_aes_gcm_eight QWORD 0000000000000000h, 0000000000000008h ptr_L_avx1_aes_gcm_eight QWORD L_avx1_aes_gcm_eight _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx1_aes_gcm_bswap_epi64 QWORD \ - 0001020304050607h, 08090a0b0c0d0e0fh +L_avx1_aes_gcm_bswap_epi64 QWORD 0001020304050607h, 08090a0b0c0d0e0fh ptr_L_avx1_aes_gcm_bswap_epi64 QWORD L_avx1_aes_gcm_bswap_epi64 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx1_aes_gcm_bswap_mask QWORD \ - 08090a0b0c0d0e0fh, 0001020304050607h +L_avx1_aes_gcm_bswap_mask QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_avx1_aes_gcm_bswap_mask QWORD L_avx1_aes_gcm_bswap_mask _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx1_aes_gcm_mod2_128 QWORD \ - 0000000000000001h, 0c200000000000000h +L_avx1_aes_gcm_mod2_128 QWORD 0000000000000001h, 0c200000000000000h ptr_L_avx1_aes_gcm_mod2_128 QWORD L_avx1_aes_gcm_mod2_128 _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -11933,14 +11907,12 @@ ENDIF IFDEF HAVE_INTEL_AVX2 _DATA SEGMENT ALIGN 16 -L_GCM_generate_m0_avx2_rev8 QWORD \ - 08090a0b0c0d0e0fh, 0001020304050607h +L_GCM_generate_m0_avx2_rev8 QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_GCM_generate_m0_avx2_rev8 QWORD L_GCM_generate_m0_avx2_rev8 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_GCM_generate_m0_avx2_mod2_128 QWORD \ - 0000000000000000h, 0e100000000000000h +L_GCM_generate_m0_avx2_mod2_128 QWORD 0000000000000000h, 0e100000000000000h ptr_L_GCM_generate_m0_avx2_mod2_128 QWORD L_GCM_generate_m0_avx2_mod2_128 _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -12154,74 +12126,62 @@ GCM_generate_m0_avx2 ENDP _TEXT ENDS _DATA SEGMENT ALIGN 16 -L_avx2_aes_gcm_one QWORD \ - 0000000000000000h, 0000000000000001h +L_avx2_aes_gcm_one QWORD 0000000000000000h, 0000000000000001h ptr_L_avx2_aes_gcm_one QWORD L_avx2_aes_gcm_one _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx2_aes_gcm_two QWORD \ - 
0000000000000000h, 0000000000000002h +L_avx2_aes_gcm_two QWORD 0000000000000000h, 0000000000000002h ptr_L_avx2_aes_gcm_two QWORD L_avx2_aes_gcm_two _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx2_aes_gcm_three QWORD \ - 0000000000000000h, 0000000000000003h +L_avx2_aes_gcm_three QWORD 0000000000000000h, 0000000000000003h ptr_L_avx2_aes_gcm_three QWORD L_avx2_aes_gcm_three _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx2_aes_gcm_four QWORD \ - 0000000000000000h, 0000000000000004h +L_avx2_aes_gcm_four QWORD 0000000000000000h, 0000000000000004h ptr_L_avx2_aes_gcm_four QWORD L_avx2_aes_gcm_four _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx2_aes_gcm_five QWORD \ - 0000000000000000h, 0000000000000005h +L_avx2_aes_gcm_five QWORD 0000000000000000h, 0000000000000005h ptr_L_avx2_aes_gcm_five QWORD L_avx2_aes_gcm_five _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx2_aes_gcm_six QWORD \ - 0000000000000000h, 0000000000000006h +L_avx2_aes_gcm_six QWORD 0000000000000000h, 0000000000000006h ptr_L_avx2_aes_gcm_six QWORD L_avx2_aes_gcm_six _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx2_aes_gcm_seven QWORD \ - 0000000000000000h, 0000000000000007h +L_avx2_aes_gcm_seven QWORD 0000000000000000h, 0000000000000007h ptr_L_avx2_aes_gcm_seven QWORD L_avx2_aes_gcm_seven _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx2_aes_gcm_eight QWORD \ - 0000000000000000h, 0000000000000008h +L_avx2_aes_gcm_eight QWORD 0000000000000000h, 0000000000000008h ptr_L_avx2_aes_gcm_eight QWORD L_avx2_aes_gcm_eight _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx2_aes_gcm_bswap_one QWORD \ - 0000000000000000h, 0100000000000000h +L_avx2_aes_gcm_bswap_one QWORD 0000000000000000h, 0100000000000000h ptr_L_avx2_aes_gcm_bswap_one QWORD L_avx2_aes_gcm_bswap_one _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx2_aes_gcm_bswap_epi64 QWORD \ - 0001020304050607h, 08090a0b0c0d0e0fh +L_avx2_aes_gcm_bswap_epi64 QWORD 0001020304050607h, 08090a0b0c0d0e0fh ptr_L_avx2_aes_gcm_bswap_epi64 QWORD L_avx2_aes_gcm_bswap_epi64 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx2_aes_gcm_bswap_mask QWORD 
\ - 08090a0b0c0d0e0fh, 0001020304050607h +L_avx2_aes_gcm_bswap_mask QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_avx2_aes_gcm_bswap_mask QWORD L_avx2_aes_gcm_bswap_mask _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx2_aes_gcm_mod2_128 QWORD \ - 0000000000000001h, 0c200000000000000h +L_avx2_aes_gcm_mod2_128 QWORD 0000000000000001h, 0c200000000000000h ptr_L_avx2_aes_gcm_mod2_128 QWORD L_avx2_aes_gcm_mod2_128 _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -16521,42 +16481,36 @@ ENDIF IFDEF HAVE_INTEL_VAES _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_inc_y0 QWORD \ - 0000000000000000h, 0000000000000000h, - 0000000000000000h, 0000000000000001h +L_vaes_aes_gcm_inc_y0 QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000001h ptr_L_vaes_aes_gcm_inc_y0 QWORD L_vaes_aes_gcm_inc_y0 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_inc_y1 QWORD \ - 0000000000000000h, 0000000000000002h, - 0000000000000000h, 0000000000000003h +L_vaes_aes_gcm_inc_y1 QWORD 0000000000000000h, 0000000000000002h + QWORD 0000000000000000h, 0000000000000003h ptr_L_vaes_aes_gcm_inc_y1 QWORD L_vaes_aes_gcm_inc_y1 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_inc_y2 QWORD \ - 0000000000000000h, 0000000000000004h, - 0000000000000000h, 0000000000000005h +L_vaes_aes_gcm_inc_y2 QWORD 0000000000000000h, 0000000000000004h + QWORD 0000000000000000h, 0000000000000005h ptr_L_vaes_aes_gcm_inc_y2 QWORD L_vaes_aes_gcm_inc_y2 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_inc_y3 QWORD \ - 0000000000000000h, 0000000000000006h, - 0000000000000000h, 0000000000000007h +L_vaes_aes_gcm_inc_y3 QWORD 0000000000000000h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000007h ptr_L_vaes_aes_gcm_inc_y3 QWORD L_vaes_aes_gcm_inc_y3 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_GCM_generate_m0_vaes_rev8 QWORD \ - 08090a0b0c0d0e0fh, 0001020304050607h +L_GCM_generate_m0_vaes_rev8 QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_GCM_generate_m0_vaes_rev8 QWORD L_GCM_generate_m0_vaes_rev8 _DATA ENDS _DATA SEGMENT 
ALIGN 16 -L_GCM_generate_m0_vaes_mod2_128 QWORD \ - 0000000000000000h, 0e100000000000000h +L_GCM_generate_m0_vaes_mod2_128 QWORD 0000000000000000h, 0e100000000000000h ptr_L_GCM_generate_m0_vaes_mod2_128 QWORD L_GCM_generate_m0_vaes_mod2_128 _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -16770,68 +16724,57 @@ GCM_generate_m0_vaes ENDP _TEXT ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_one QWORD \ - 0000000000000000h, 0000000000000001h +L_vaes_aes_gcm_one QWORD 0000000000000000h, 0000000000000001h ptr_L_vaes_aes_gcm_one QWORD L_vaes_aes_gcm_one _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_two QWORD \ - 0000000000000000h, 0000000000000002h +L_vaes_aes_gcm_two QWORD 0000000000000000h, 0000000000000002h ptr_L_vaes_aes_gcm_two QWORD L_vaes_aes_gcm_two _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_three QWORD \ - 0000000000000000h, 0000000000000003h +L_vaes_aes_gcm_three QWORD 0000000000000000h, 0000000000000003h ptr_L_vaes_aes_gcm_three QWORD L_vaes_aes_gcm_three _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_four QWORD \ - 0000000000000000h, 0000000000000004h +L_vaes_aes_gcm_four QWORD 0000000000000000h, 0000000000000004h ptr_L_vaes_aes_gcm_four QWORD L_vaes_aes_gcm_four _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_five QWORD \ - 0000000000000000h, 0000000000000005h +L_vaes_aes_gcm_five QWORD 0000000000000000h, 0000000000000005h ptr_L_vaes_aes_gcm_five QWORD L_vaes_aes_gcm_five _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_six QWORD \ - 0000000000000000h, 0000000000000006h +L_vaes_aes_gcm_six QWORD 0000000000000000h, 0000000000000006h ptr_L_vaes_aes_gcm_six QWORD L_vaes_aes_gcm_six _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_seven QWORD \ - 0000000000000000h, 0000000000000007h +L_vaes_aes_gcm_seven QWORD 0000000000000000h, 0000000000000007h ptr_L_vaes_aes_gcm_seven QWORD L_vaes_aes_gcm_seven _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_eight QWORD \ - 0000000000000000h, 0000000000000008h +L_vaes_aes_gcm_eight QWORD 0000000000000000h, 
0000000000000008h ptr_L_vaes_aes_gcm_eight QWORD L_vaes_aes_gcm_eight _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_bswap_epi64 QWORD \ - 0001020304050607h, 08090a0b0c0d0e0fh +L_vaes_aes_gcm_bswap_epi64 QWORD 0001020304050607h, 08090a0b0c0d0e0fh ptr_L_vaes_aes_gcm_bswap_epi64 QWORD L_vaes_aes_gcm_bswap_epi64 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_bswap_mask QWORD \ - 08090a0b0c0d0e0fh, 0001020304050607h +L_vaes_aes_gcm_bswap_mask QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_vaes_aes_gcm_bswap_mask QWORD L_vaes_aes_gcm_bswap_mask _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_gcm_mod2_128 QWORD \ - 0000000000000001h, 0c200000000000000h +L_vaes_aes_gcm_mod2_128 QWORD 0000000000000001h, 0c200000000000000h ptr_L_vaes_aes_gcm_mod2_128 QWORD L_vaes_aes_gcm_mod2_128 _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -17587,7 +17530,7 @@ L_AES_GCM_encrypt_vaes_loop_256: lea rcx, QWORD PTR [rsi+rbx] mov QWORD PTR [rsp+544], rcx vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 - vbroadcasti128 ymm4, [rsp+512] + vbroadcasti128 ymm4, OWORD PTR [rsp+512] vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 vpshufb ymm0, ymm0, ymm6 vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 @@ -17599,81 +17542,81 @@ L_AES_GCM_encrypt_vaes_loop_256: vmovdqu xmm7, OWORD PTR [rsp+512] vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight vmovdqu OWORD PTR [rsp+512], xmm7 - vbroadcasti128 ymm4, [r15] + vbroadcasti128 ymm4, OWORD PTR [r15] vpxor ymm0, ymm0, ymm4 vpxor ymm1, ymm1, ymm4 vpxor ymm2, ymm2, ymm4 vpxor ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+16] + vbroadcasti128 ymm4, OWORD PTR [r15+16] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+32] + vbroadcasti128 ymm4, OWORD PTR [r15+32] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+48] + vbroadcasti128 ymm4, OWORD PTR [r15+48] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, 
ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+64] + vbroadcasti128 ymm4, OWORD PTR [r15+64] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+80] + vbroadcasti128 ymm4, OWORD PTR [r15+80] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+96] + vbroadcasti128 ymm4, OWORD PTR [r15+96] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+112] + vbroadcasti128 ymm4, OWORD PTR [r15+112] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+128] + vbroadcasti128 ymm4, OWORD PTR [r15+128] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+144] + vbroadcasti128 ymm4, OWORD PTR [r15+144] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r10d, 11 - vbroadcasti128 ymm4, [r15+160] + vbroadcasti128 ymm4, OWORD PTR [r15+160] jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+176] + vbroadcasti128 ymm4, OWORD PTR [r15+176] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r10d, 13 - vbroadcasti128 ymm4, [r15+192] + vbroadcasti128 ymm4, OWORD PTR [r15+192] jl L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+208] + vbroadcasti128 ymm4, OWORD PTR [r15+208] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+224] + vbroadcasti128 ymm4, OWORD PTR 
[r15+224] L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last: vaesenclast ymm0, ymm0, ymm4 vaesenclast ymm1, ymm1, ymm4 @@ -17694,7 +17637,7 @@ L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last: vpxor ymm3, ymm3, ymm5 vmovdqu YMMWORD PTR [rdx+96], ymm3 add ebx, 128 - vbroadcasti128 ymm4, [rsp+512] + vbroadcasti128 ymm4, OWORD PTR [rsp+512] vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 vpshufb ymm0, ymm0, ymm6 vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 @@ -17706,81 +17649,81 @@ L_AES_GCM_encrypt_vaes_p1_vaes_ctr8_last: vmovdqu xmm7, OWORD PTR [rsp+512] vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight vmovdqu OWORD PTR [rsp+512], xmm7 - vbroadcasti128 ymm4, [r15] + vbroadcasti128 ymm4, OWORD PTR [r15] vpxor ymm0, ymm0, ymm4 vpxor ymm1, ymm1, ymm4 vpxor ymm2, ymm2, ymm4 vpxor ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+16] + vbroadcasti128 ymm4, OWORD PTR [r15+16] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+32] + vbroadcasti128 ymm4, OWORD PTR [r15+32] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+48] + vbroadcasti128 ymm4, OWORD PTR [r15+48] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+64] + vbroadcasti128 ymm4, OWORD PTR [r15+64] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+80] + vbroadcasti128 ymm4, OWORD PTR [r15+80] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+96] + vbroadcasti128 ymm4, OWORD PTR [r15+96] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+112] + vbroadcasti128 ymm4, OWORD PTR [r15+112] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 
vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+128] + vbroadcasti128 ymm4, OWORD PTR [r15+128] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+144] + vbroadcasti128 ymm4, OWORD PTR [r15+144] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r10d, 11 - vbroadcasti128 ymm4, [r15+160] + vbroadcasti128 ymm4, OWORD PTR [r15+160] jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+176] + vbroadcasti128 ymm4, OWORD PTR [r15+176] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r10d, 13 - vbroadcasti128 ymm4, [r15+192] + vbroadcasti128 ymm4, OWORD PTR [r15+192] jl L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+208] + vbroadcasti128 ymm4, OWORD PTR [r15+208] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+224] + vbroadcasti128 ymm4, OWORD PTR [r15+224] L_AES_GCM_encrypt_vaes_p2_vaes_ctr8_last: vaesenclast ymm0, ymm0, ymm4 vaesenclast ymm1, ymm1, ymm4 @@ -17914,7 +17857,7 @@ L_AES_GCM_encrypt_vaes_after_256: lea rcx, QWORD PTR [rsi+rbx] mov QWORD PTR [rsp+544], rcx vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 - vbroadcasti128 ymm4, [rsp+512] + vbroadcasti128 ymm4, OWORD PTR [rsp+512] vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 vpshufb ymm0, ymm0, ymm6 vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 @@ -17926,81 +17869,81 @@ L_AES_GCM_encrypt_vaes_after_256: vmovdqu xmm7, OWORD PTR [rsp+512] vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight vmovdqu OWORD PTR [rsp+512], xmm7 - vbroadcasti128 ymm4, [r15] + vbroadcasti128 ymm4, OWORD PTR [r15] vpxor 
ymm0, ymm0, ymm4 vpxor ymm1, ymm1, ymm4 vpxor ymm2, ymm2, ymm4 vpxor ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+16] + vbroadcasti128 ymm4, OWORD PTR [r15+16] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+32] + vbroadcasti128 ymm4, OWORD PTR [r15+32] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+48] + vbroadcasti128 ymm4, OWORD PTR [r15+48] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+64] + vbroadcasti128 ymm4, OWORD PTR [r15+64] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+80] + vbroadcasti128 ymm4, OWORD PTR [r15+80] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+96] + vbroadcasti128 ymm4, OWORD PTR [r15+96] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+112] + vbroadcasti128 ymm4, OWORD PTR [r15+112] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+128] + vbroadcasti128 ymm4, OWORD PTR [r15+128] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+144] + vbroadcasti128 ymm4, OWORD PTR [r15+144] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r10d, 11 - vbroadcasti128 ymm4, [r15+160] + vbroadcasti128 ymm4, OWORD PTR [r15+160] jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+176] + vbroadcasti128 ymm4, OWORD PTR [r15+176] vaesenc ymm0, ymm0, ymm4 vaesenc 
ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r10d, 13 - vbroadcasti128 ymm4, [r15+192] + vbroadcasti128 ymm4, OWORD PTR [r15+192] jl L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+208] + vbroadcasti128 ymm4, OWORD PTR [r15+208] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+224] + vbroadcasti128 ymm4, OWORD PTR [r15+224] L_AES_GCM_encrypt_vaes_8_vaes_ctr8_last: vaesenclast ymm0, ymm0, ymm4 vaesenclast ymm1, ymm1, ymm4 @@ -19190,7 +19133,7 @@ L_AES_GCM_decrypt_vaes_loop_256: vextracti128 xmm0, ymm13, 1 vpxor xmm15, xmm13, xmm0 vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 - vbroadcasti128 ymm4, [rsp+512] + vbroadcasti128 ymm4, OWORD PTR [rsp+512] vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 vpshufb ymm0, ymm0, ymm6 vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 @@ -19202,81 +19145,81 @@ L_AES_GCM_decrypt_vaes_loop_256: vmovdqu xmm7, OWORD PTR [rsp+512] vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight vmovdqu OWORD PTR [rsp+512], xmm7 - vbroadcasti128 ymm4, [r15] + vbroadcasti128 ymm4, OWORD PTR [r15] vpxor ymm0, ymm0, ymm4 vpxor ymm1, ymm1, ymm4 vpxor ymm2, ymm2, ymm4 vpxor ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+16] + vbroadcasti128 ymm4, OWORD PTR [r15+16] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+32] + vbroadcasti128 ymm4, OWORD PTR [r15+32] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+48] + vbroadcasti128 ymm4, OWORD PTR [r15+48] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+64] + vbroadcasti128 ymm4, OWORD PTR [r15+64] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, 
ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+80] + vbroadcasti128 ymm4, OWORD PTR [r15+80] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+96] + vbroadcasti128 ymm4, OWORD PTR [r15+96] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+112] + vbroadcasti128 ymm4, OWORD PTR [r15+112] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+128] + vbroadcasti128 ymm4, OWORD PTR [r15+128] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+144] + vbroadcasti128 ymm4, OWORD PTR [r15+144] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r10d, 11 - vbroadcasti128 ymm4, [r15+160] + vbroadcasti128 ymm4, OWORD PTR [r15+160] jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+176] + vbroadcasti128 ymm4, OWORD PTR [r15+176] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r10d, 13 - vbroadcasti128 ymm4, [r15+192] + vbroadcasti128 ymm4, OWORD PTR [r15+192] jl L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+208] + vbroadcasti128 ymm4, OWORD PTR [r15+208] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+224] + vbroadcasti128 ymm4, OWORD PTR [r15+224] L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last: vaesenclast ymm0, ymm0, ymm4 vaesenclast ymm1, ymm1, ymm4 @@ -19297,7 +19240,7 @@ L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last: vpxor 
ymm3, ymm3, ymm5 vmovdqu YMMWORD PTR [rdx+96], ymm3 add ebx, 128 - vbroadcasti128 ymm4, [rsp+512] + vbroadcasti128 ymm4, OWORD PTR [rsp+512] vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 vpshufb ymm0, ymm0, ymm6 vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 @@ -19309,81 +19252,81 @@ L_AES_GCM_decrypt_vaes_l1_vaes_ctr8_last: vmovdqu xmm7, OWORD PTR [rsp+512] vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight vmovdqu OWORD PTR [rsp+512], xmm7 - vbroadcasti128 ymm4, [r15] + vbroadcasti128 ymm4, OWORD PTR [r15] vpxor ymm0, ymm0, ymm4 vpxor ymm1, ymm1, ymm4 vpxor ymm2, ymm2, ymm4 vpxor ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+16] + vbroadcasti128 ymm4, OWORD PTR [r15+16] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+32] + vbroadcasti128 ymm4, OWORD PTR [r15+32] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+48] + vbroadcasti128 ymm4, OWORD PTR [r15+48] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+64] + vbroadcasti128 ymm4, OWORD PTR [r15+64] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+80] + vbroadcasti128 ymm4, OWORD PTR [r15+80] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+96] + vbroadcasti128 ymm4, OWORD PTR [r15+96] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+112] + vbroadcasti128 ymm4, OWORD PTR [r15+112] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+128] + vbroadcasti128 ymm4, OWORD PTR [r15+128] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc 
ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+144] + vbroadcasti128 ymm4, OWORD PTR [r15+144] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r10d, 11 - vbroadcasti128 ymm4, [r15+160] + vbroadcasti128 ymm4, OWORD PTR [r15+160] jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+176] + vbroadcasti128 ymm4, OWORD PTR [r15+176] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r10d, 13 - vbroadcasti128 ymm4, [r15+192] + vbroadcasti128 ymm4, OWORD PTR [r15+192] jl L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+208] + vbroadcasti128 ymm4, OWORD PTR [r15+208] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+224] + vbroadcasti128 ymm4, OWORD PTR [r15+224] L_AES_GCM_decrypt_vaes_l2_vaes_ctr8_last: vaesenclast ymm0, ymm0, ymm4 vaesenclast ymm1, ymm1, ymm4 @@ -19475,7 +19418,7 @@ L_AES_GCM_decrypt_vaes_after_256: vextracti128 xmm0, ymm13, 1 vpxor xmm15, xmm13, xmm0 vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 - vbroadcasti128 ymm4, [rsp+512] + vbroadcasti128 ymm4, OWORD PTR [rsp+512] vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 vpshufb ymm0, ymm0, ymm6 vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 @@ -19487,81 +19430,81 @@ L_AES_GCM_decrypt_vaes_after_256: vmovdqu xmm7, OWORD PTR [rsp+512] vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight vmovdqu OWORD PTR [rsp+512], xmm7 - vbroadcasti128 ymm4, [r15] + vbroadcasti128 ymm4, OWORD PTR [r15] vpxor ymm0, ymm0, ymm4 vpxor ymm1, ymm1, ymm4 vpxor ymm2, ymm2, ymm4 vpxor ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+16] + vbroadcasti128 ymm4, OWORD PTR [r15+16] vaesenc ymm0, ymm0, ymm4 
vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+32] + vbroadcasti128 ymm4, OWORD PTR [r15+32] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+48] + vbroadcasti128 ymm4, OWORD PTR [r15+48] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+64] + vbroadcasti128 ymm4, OWORD PTR [r15+64] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+80] + vbroadcasti128 ymm4, OWORD PTR [r15+80] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+96] + vbroadcasti128 ymm4, OWORD PTR [r15+96] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+112] + vbroadcasti128 ymm4, OWORD PTR [r15+112] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+128] + vbroadcasti128 ymm4, OWORD PTR [r15+128] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+144] + vbroadcasti128 ymm4, OWORD PTR [r15+144] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r10d, 11 - vbroadcasti128 ymm4, [r15+160] + vbroadcasti128 ymm4, OWORD PTR [r15+160] jl L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+176] + vbroadcasti128 ymm4, OWORD PTR [r15+176] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r10d, 13 - vbroadcasti128 ymm4, [r15+192] + vbroadcasti128 ymm4, OWORD PTR [r15+192] jl 
L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+208] + vbroadcasti128 ymm4, OWORD PTR [r15+208] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [r15+224] + vbroadcasti128 ymm4, OWORD PTR [r15+224] L_AES_GCM_decrypt_vaes_t_vaes_ctr8_last: vaesenclast ymm0, ymm0, ymm4 vaesenclast ymm1, ymm1, ymm4 @@ -20618,7 +20561,7 @@ L_AES_GCM_encrypt_update_vaes_loop_256: ; 256 bytes of input lea rsi, QWORD PTR [r10+rdi] vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 - vbroadcasti128 ymm4, [r15] + vbroadcasti128 ymm4, OWORD PTR [r15] vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 vpshufb ymm0, ymm0, ymm6 vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 @@ -20630,81 +20573,81 @@ L_AES_GCM_encrypt_update_vaes_loop_256: vmovdqu xmm7, OWORD PTR [r15] vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight vmovdqu OWORD PTR [r15], xmm7 - vbroadcasti128 ymm4, [rax] + vbroadcasti128 ymm4, OWORD PTR [rax] vpxor ymm0, ymm0, ymm4 vpxor ymm1, ymm1, ymm4 vpxor ymm2, ymm2, ymm4 vpxor ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+16] + vbroadcasti128 ymm4, OWORD PTR [rax+16] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+32] + vbroadcasti128 ymm4, OWORD PTR [rax+32] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+48] + vbroadcasti128 ymm4, OWORD PTR [rax+48] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+64] + vbroadcasti128 ymm4, OWORD PTR [rax+64] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+80] + vbroadcasti128 ymm4, OWORD PTR [rax+80] vaesenc ymm0, ymm0, ymm4 vaesenc 
ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+96] + vbroadcasti128 ymm4, OWORD PTR [rax+96] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+112] + vbroadcasti128 ymm4, OWORD PTR [rax+112] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+128] + vbroadcasti128 ymm4, OWORD PTR [rax+128] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+144] + vbroadcasti128 ymm4, OWORD PTR [rax+144] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r8d, 11 - vbroadcasti128 ymm4, [rax+160] + vbroadcasti128 ymm4, OWORD PTR [rax+160] jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+176] + vbroadcasti128 ymm4, OWORD PTR [rax+176] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r8d, 13 - vbroadcasti128 ymm4, [rax+192] + vbroadcasti128 ymm4, OWORD PTR [rax+192] jl L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+208] + vbroadcasti128 ymm4, OWORD PTR [rax+208] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+224] + vbroadcasti128 ymm4, OWORD PTR [rax+224] L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last: vaesenclast ymm0, ymm0, ymm4 vaesenclast ymm1, ymm1, ymm4 @@ -20725,7 +20668,7 @@ L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last: vpxor ymm3, ymm3, ymm5 vmovdqu YMMWORD PTR [rdx+96], ymm3 add edi, 128 - vbroadcasti128 ymm4, [r15] + vbroadcasti128 ymm4, OWORD PTR [r15] 
vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 vpshufb ymm0, ymm0, ymm6 vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 @@ -20737,81 +20680,81 @@ L_AES_GCM_encrypt_update_vaes_p1_vaes_ctr8_last: vmovdqu xmm7, OWORD PTR [r15] vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight vmovdqu OWORD PTR [r15], xmm7 - vbroadcasti128 ymm4, [rax] + vbroadcasti128 ymm4, OWORD PTR [rax] vpxor ymm0, ymm0, ymm4 vpxor ymm1, ymm1, ymm4 vpxor ymm2, ymm2, ymm4 vpxor ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+16] + vbroadcasti128 ymm4, OWORD PTR [rax+16] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+32] + vbroadcasti128 ymm4, OWORD PTR [rax+32] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+48] + vbroadcasti128 ymm4, OWORD PTR [rax+48] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+64] + vbroadcasti128 ymm4, OWORD PTR [rax+64] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+80] + vbroadcasti128 ymm4, OWORD PTR [rax+80] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+96] + vbroadcasti128 ymm4, OWORD PTR [rax+96] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+112] + vbroadcasti128 ymm4, OWORD PTR [rax+112] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+128] + vbroadcasti128 ymm4, OWORD PTR [rax+128] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+144] + vbroadcasti128 ymm4, OWORD PTR [rax+144] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 
vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r8d, 11 - vbroadcasti128 ymm4, [rax+160] + vbroadcasti128 ymm4, OWORD PTR [rax+160] jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+176] + vbroadcasti128 ymm4, OWORD PTR [rax+176] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r8d, 13 - vbroadcasti128 ymm4, [rax+192] + vbroadcasti128 ymm4, OWORD PTR [rax+192] jl L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+208] + vbroadcasti128 ymm4, OWORD PTR [rax+208] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+224] + vbroadcasti128 ymm4, OWORD PTR [rax+224] L_AES_GCM_encrypt_update_vaes_p2_vaes_ctr8_last: vaesenclast ymm0, ymm0, ymm4 vaesenclast ymm1, ymm1, ymm4 @@ -20943,7 +20886,7 @@ L_AES_GCM_encrypt_update_vaes_after_256: ; 128 bytes of input lea rsi, QWORD PTR [r10+rdi] vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 - vbroadcasti128 ymm4, [r15] + vbroadcasti128 ymm4, OWORD PTR [r15] vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 vpshufb ymm0, ymm0, ymm6 vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 @@ -20955,81 +20898,81 @@ L_AES_GCM_encrypt_update_vaes_after_256: vmovdqu xmm7, OWORD PTR [r15] vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight vmovdqu OWORD PTR [r15], xmm7 - vbroadcasti128 ymm4, [rax] + vbroadcasti128 ymm4, OWORD PTR [rax] vpxor ymm0, ymm0, ymm4 vpxor ymm1, ymm1, ymm4 vpxor ymm2, ymm2, ymm4 vpxor ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+16] + vbroadcasti128 ymm4, OWORD PTR [rax+16] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+32] + vbroadcasti128 ymm4, 
OWORD PTR [rax+32] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+48] + vbroadcasti128 ymm4, OWORD PTR [rax+48] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+64] + vbroadcasti128 ymm4, OWORD PTR [rax+64] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+80] + vbroadcasti128 ymm4, OWORD PTR [rax+80] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+96] + vbroadcasti128 ymm4, OWORD PTR [rax+96] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+112] + vbroadcasti128 ymm4, OWORD PTR [rax+112] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+128] + vbroadcasti128 ymm4, OWORD PTR [rax+128] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+144] + vbroadcasti128 ymm4, OWORD PTR [rax+144] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r8d, 11 - vbroadcasti128 ymm4, [rax+160] + vbroadcasti128 ymm4, OWORD PTR [rax+160] jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+176] + vbroadcasti128 ymm4, OWORD PTR [rax+176] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r8d, 13 - vbroadcasti128 ymm4, [rax+192] + vbroadcasti128 ymm4, OWORD PTR [rax+192] jl L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - 
vbroadcasti128 ymm4, [rax+208] + vbroadcasti128 ymm4, OWORD PTR [rax+208] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+224] + vbroadcasti128 ymm4, OWORD PTR [rax+224] L_AES_GCM_encrypt_update_vaes_8_vaes_ctr8_last: vaesenclast ymm0, ymm0, ymm4 vaesenclast ymm1, ymm1, ymm4 @@ -21778,7 +21721,7 @@ L_AES_GCM_decrypt_update_vaes_loop_256: vextracti128 xmm0, ymm13, 1 vpxor xmm15, xmm13, xmm0 vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 - vbroadcasti128 ymm4, [r15] + vbroadcasti128 ymm4, OWORD PTR [r15] vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 vpshufb ymm0, ymm0, ymm6 vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 @@ -21790,81 +21733,81 @@ L_AES_GCM_decrypt_update_vaes_loop_256: vmovdqu xmm7, OWORD PTR [r15] vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight vmovdqu OWORD PTR [r15], xmm7 - vbroadcasti128 ymm4, [rax] + vbroadcasti128 ymm4, OWORD PTR [rax] vpxor ymm0, ymm0, ymm4 vpxor ymm1, ymm1, ymm4 vpxor ymm2, ymm2, ymm4 vpxor ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+16] + vbroadcasti128 ymm4, OWORD PTR [rax+16] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+32] + vbroadcasti128 ymm4, OWORD PTR [rax+32] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+48] + vbroadcasti128 ymm4, OWORD PTR [rax+48] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+64] + vbroadcasti128 ymm4, OWORD PTR [rax+64] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+80] + vbroadcasti128 ymm4, OWORD PTR [rax+80] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+96] + vbroadcasti128 ymm4, OWORD PTR 
[rax+96] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+112] + vbroadcasti128 ymm4, OWORD PTR [rax+112] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+128] + vbroadcasti128 ymm4, OWORD PTR [rax+128] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+144] + vbroadcasti128 ymm4, OWORD PTR [rax+144] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r8d, 11 - vbroadcasti128 ymm4, [rax+160] + vbroadcasti128 ymm4, OWORD PTR [rax+160] jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+176] + vbroadcasti128 ymm4, OWORD PTR [rax+176] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r8d, 13 - vbroadcasti128 ymm4, [rax+192] + vbroadcasti128 ymm4, OWORD PTR [rax+192] jl L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+208] + vbroadcasti128 ymm4, OWORD PTR [rax+208] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+224] + vbroadcasti128 ymm4, OWORD PTR [rax+224] L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last: vaesenclast ymm0, ymm0, ymm4 vaesenclast ymm1, ymm1, ymm4 @@ -21885,7 +21828,7 @@ L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last: vpxor ymm3, ymm3, ymm5 vmovdqu YMMWORD PTR [rdx+96], ymm3 add edi, 128 - vbroadcasti128 ymm4, [r15] + vbroadcasti128 ymm4, OWORD PTR [r15] vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 vpshufb ymm0, ymm0, ymm6 vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 @@ 
-21897,81 +21840,81 @@ L_AES_GCM_decrypt_update_vaes_l1_vaes_ctr8_last: vmovdqu xmm7, OWORD PTR [r15] vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight vmovdqu OWORD PTR [r15], xmm7 - vbroadcasti128 ymm4, [rax] + vbroadcasti128 ymm4, OWORD PTR [rax] vpxor ymm0, ymm0, ymm4 vpxor ymm1, ymm1, ymm4 vpxor ymm2, ymm2, ymm4 vpxor ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+16] + vbroadcasti128 ymm4, OWORD PTR [rax+16] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+32] + vbroadcasti128 ymm4, OWORD PTR [rax+32] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+48] + vbroadcasti128 ymm4, OWORD PTR [rax+48] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+64] + vbroadcasti128 ymm4, OWORD PTR [rax+64] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+80] + vbroadcasti128 ymm4, OWORD PTR [rax+80] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+96] + vbroadcasti128 ymm4, OWORD PTR [rax+96] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+112] + vbroadcasti128 ymm4, OWORD PTR [rax+112] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+128] + vbroadcasti128 ymm4, OWORD PTR [rax+128] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+144] + vbroadcasti128 ymm4, OWORD PTR [rax+144] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r8d, 11 - vbroadcasti128 ymm4, [rax+160] + vbroadcasti128 ymm4, OWORD PTR 
[rax+160] jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+176] + vbroadcasti128 ymm4, OWORD PTR [rax+176] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r8d, 13 - vbroadcasti128 ymm4, [rax+192] + vbroadcasti128 ymm4, OWORD PTR [rax+192] jl L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+208] + vbroadcasti128 ymm4, OWORD PTR [rax+208] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+224] + vbroadcasti128 ymm4, OWORD PTR [rax+224] L_AES_GCM_decrypt_update_vaes_l2_vaes_ctr8_last: vaesenclast ymm0, ymm0, ymm4 vaesenclast ymm1, ymm1, ymm4 @@ -22063,7 +22006,7 @@ L_AES_GCM_decrypt_update_vaes_after_256: vextracti128 xmm0, ymm13, 1 vpxor xmm15, xmm13, xmm0 vbroadcasti128 ymm6, ptr_L_vaes_aes_gcm_bswap_epi64 - vbroadcasti128 ymm4, [r15] + vbroadcasti128 ymm4, OWORD PTR [r15] vpaddd ymm0, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y0 vpshufb ymm0, ymm0, ymm6 vpaddd ymm1, ymm4, YMMWORD PTR L_vaes_aes_gcm_inc_y1 @@ -22075,81 +22018,81 @@ L_AES_GCM_decrypt_update_vaes_after_256: vmovdqu xmm7, OWORD PTR [r15] vpaddd xmm7, xmm7, OWORD PTR L_vaes_aes_gcm_eight vmovdqu OWORD PTR [r15], xmm7 - vbroadcasti128 ymm4, [rax] + vbroadcasti128 ymm4, OWORD PTR [rax] vpxor ymm0, ymm0, ymm4 vpxor ymm1, ymm1, ymm4 vpxor ymm2, ymm2, ymm4 vpxor ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+16] + vbroadcasti128 ymm4, OWORD PTR [rax+16] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+32] + vbroadcasti128 ymm4, OWORD PTR [rax+32] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - 
vbroadcasti128 ymm4, [rax+48] + vbroadcasti128 ymm4, OWORD PTR [rax+48] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+64] + vbroadcasti128 ymm4, OWORD PTR [rax+64] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+80] + vbroadcasti128 ymm4, OWORD PTR [rax+80] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+96] + vbroadcasti128 ymm4, OWORD PTR [rax+96] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+112] + vbroadcasti128 ymm4, OWORD PTR [rax+112] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+128] + vbroadcasti128 ymm4, OWORD PTR [rax+128] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+144] + vbroadcasti128 ymm4, OWORD PTR [rax+144] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r8d, 11 - vbroadcasti128 ymm4, [rax+160] + vbroadcasti128 ymm4, OWORD PTR [rax+160] jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+176] + vbroadcasti128 ymm4, OWORD PTR [rax+176] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 cmp r8d, 13 - vbroadcasti128 ymm4, [rax+192] + vbroadcasti128 ymm4, OWORD PTR [rax+192] jl L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+208] + vbroadcasti128 ymm4, OWORD PTR [rax+208] vaesenc ymm0, ymm0, ymm4 vaesenc ymm1, ymm1, ymm4 
vaesenc ymm2, ymm2, ymm4 vaesenc ymm3, ymm3, ymm4 - vbroadcasti128 ymm4, [rax+224] + vbroadcasti128 ymm4, OWORD PTR [rax+224] L_AES_GCM_decrypt_update_vaes_t_vaes_ctr8_last: vaesenclast ymm0, ymm0, ymm4 vaesenclast ymm1, ymm1, ymm4 @@ -22372,56 +22315,49 @@ ENDIF IFDEF HAVE_INTEL_AVX512 _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_inc_z0 QWORD \ - 0000000000000000h, 0000000000000000h, - 0000000000000000h, 0000000000000001h, - 0000000000000000h, 0000000000000002h, - 0000000000000000h, 0000000000000003h +L_avx512_aes_gcm_inc_z0 QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000001h + QWORD 0000000000000000h, 0000000000000002h + QWORD 0000000000000000h, 0000000000000003h ptr_L_avx512_aes_gcm_inc_z0 QWORD L_avx512_aes_gcm_inc_z0 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_inc_z1 QWORD \ - 0000000000000000h, 0000000000000004h, - 0000000000000000h, 0000000000000005h, - 0000000000000000h, 0000000000000006h, - 0000000000000000h, 0000000000000007h +L_avx512_aes_gcm_inc_z1 QWORD 0000000000000000h, 0000000000000004h + QWORD 0000000000000000h, 0000000000000005h + QWORD 0000000000000000h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000007h ptr_L_avx512_aes_gcm_inc_z1 QWORD L_avx512_aes_gcm_inc_z1 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_inc_z2 QWORD \ - 0000000000000000h, 0000000000000008h, - 0000000000000000h, 0000000000000009h, - 0000000000000000h, 000000000000000ah, - 0000000000000000h, 000000000000000bh +L_avx512_aes_gcm_inc_z2 QWORD 0000000000000000h, 0000000000000008h + QWORD 0000000000000000h, 0000000000000009h + QWORD 0000000000000000h, 000000000000000ah + QWORD 0000000000000000h, 000000000000000bh ptr_L_avx512_aes_gcm_inc_z2 QWORD L_avx512_aes_gcm_inc_z2 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_inc_z3 QWORD \ - 0000000000000000h, 000000000000000ch, - 0000000000000000h, 000000000000000dh, - 0000000000000000h, 000000000000000eh, - 0000000000000000h, 000000000000000fh +L_avx512_aes_gcm_inc_z3 QWORD 
0000000000000000h, 000000000000000ch + QWORD 0000000000000000h, 000000000000000dh + QWORD 0000000000000000h, 000000000000000eh + QWORD 0000000000000000h, 000000000000000fh ptr_L_avx512_aes_gcm_inc_z3 QWORD L_avx512_aes_gcm_inc_z3 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_sixteen QWORD \ - 0000000000000000h, 0000000000000010h +L_avx512_aes_gcm_sixteen QWORD 0000000000000000h, 0000000000000010h ptr_L_avx512_aes_gcm_sixteen QWORD L_avx512_aes_gcm_sixteen _DATA ENDS _DATA SEGMENT ALIGN 16 -L_GCM_generate_m0_avx512_rev8 QWORD \ - 08090a0b0c0d0e0fh, 0001020304050607h +L_GCM_generate_m0_avx512_rev8 QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_GCM_generate_m0_avx512_rev8 QWORD L_GCM_generate_m0_avx512_rev8 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_GCM_generate_m0_avx512_mod2_128 QWORD \ - 0000000000000000h, 0e100000000000000h +L_GCM_generate_m0_avx512_mod2_128 QWORD 0000000000000000h, 0e100000000000000h ptr_L_GCM_generate_m0_avx512_mod2_128 QWORD L_GCM_generate_m0_avx512_mod2_128 _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -22635,68 +22571,57 @@ GCM_generate_m0_avx512 ENDP _TEXT ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_one QWORD \ - 0000000000000000h, 0000000000000001h +L_avx512_aes_gcm_one QWORD 0000000000000000h, 0000000000000001h ptr_L_avx512_aes_gcm_one QWORD L_avx512_aes_gcm_one _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_two QWORD \ - 0000000000000000h, 0000000000000002h +L_avx512_aes_gcm_two QWORD 0000000000000000h, 0000000000000002h ptr_L_avx512_aes_gcm_two QWORD L_avx512_aes_gcm_two _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_three QWORD \ - 0000000000000000h, 0000000000000003h +L_avx512_aes_gcm_three QWORD 0000000000000000h, 0000000000000003h ptr_L_avx512_aes_gcm_three QWORD L_avx512_aes_gcm_three _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_four QWORD \ - 0000000000000000h, 0000000000000004h +L_avx512_aes_gcm_four QWORD 0000000000000000h, 0000000000000004h ptr_L_avx512_aes_gcm_four QWORD L_avx512_aes_gcm_four _DATA ENDS _DATA 
SEGMENT ALIGN 16 -L_avx512_aes_gcm_five QWORD \ - 0000000000000000h, 0000000000000005h +L_avx512_aes_gcm_five QWORD 0000000000000000h, 0000000000000005h ptr_L_avx512_aes_gcm_five QWORD L_avx512_aes_gcm_five _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_six QWORD \ - 0000000000000000h, 0000000000000006h +L_avx512_aes_gcm_six QWORD 0000000000000000h, 0000000000000006h ptr_L_avx512_aes_gcm_six QWORD L_avx512_aes_gcm_six _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_seven QWORD \ - 0000000000000000h, 0000000000000007h +L_avx512_aes_gcm_seven QWORD 0000000000000000h, 0000000000000007h ptr_L_avx512_aes_gcm_seven QWORD L_avx512_aes_gcm_seven _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_eight QWORD \ - 0000000000000000h, 0000000000000008h +L_avx512_aes_gcm_eight QWORD 0000000000000000h, 0000000000000008h ptr_L_avx512_aes_gcm_eight QWORD L_avx512_aes_gcm_eight _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_bswap_epi64 QWORD \ - 0001020304050607h, 08090a0b0c0d0e0fh +L_avx512_aes_gcm_bswap_epi64 QWORD 0001020304050607h, 08090a0b0c0d0e0fh ptr_L_avx512_aes_gcm_bswap_epi64 QWORD L_avx512_aes_gcm_bswap_epi64 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_bswap_mask QWORD \ - 08090a0b0c0d0e0fh, 0001020304050607h +L_avx512_aes_gcm_bswap_mask QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_avx512_aes_gcm_bswap_mask QWORD L_avx512_aes_gcm_bswap_mask _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_gcm_mod2_128 QWORD \ - 0000000000000001h, 0c200000000000000h +L_avx512_aes_gcm_mod2_128 QWORD 0000000000000001h, 0c200000000000000h ptr_L_avx512_aes_gcm_mod2_128 QWORD L_avx512_aes_gcm_mod2_128 _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -23692,16 +23617,16 @@ L_AES_GCM_encrypt_avx512_no_ext: vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64 vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 - vbroadcasti32x4 zmm9, [r15] - vbroadcasti32x4 zmm10, [r15+16] - vbroadcasti32x4 zmm11, [r15+32] - 
vbroadcasti32x4 zmm12, [r15+48] - vbroadcasti32x4 zmm13, [r15+64] - vbroadcasti32x4 zmm14, [r15+80] - vbroadcasti32x4 zmm15, [r15+96] - vbroadcasti32x4 zmm1, [r15+112] - vbroadcasti32x4 zmm2, [r15+128] - vbroadcasti32x4 zmm3, [r15+144] + vbroadcasti32x4 zmm9, OWORD PTR [r15] + vbroadcasti32x4 zmm10, OWORD PTR [r15+16] + vbroadcasti32x4 zmm11, OWORD PTR [r15+32] + vbroadcasti32x4 zmm12, OWORD PTR [r15+48] + vbroadcasti32x4 zmm13, OWORD PTR [r15+64] + vbroadcasti32x4 zmm14, OWORD PTR [r15+80] + vbroadcasti32x4 zmm15, OWORD PTR [r15+96] + vbroadcasti32x4 zmm1, OWORD PTR [r15+112] + vbroadcasti32x4 zmm2, OWORD PTR [r15+128] + vbroadcasti32x4 zmm3, OWORD PTR [r15+144] cmp r9d, 512 jl L_AES_GCM_encrypt_avx512_no_windows mov r13d, r9d @@ -23733,7 +23658,7 @@ L_AES_GCM_encrypt_avx512_no_ext: ; 512 bytes of input lea rcx, QWORD PTR [rsi+rbx] mov QWORD PTR [rsp+1056], rcx - vbroadcasti32x4 zmm20, [rsp+1024] + vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -23786,30 +23711,30 @@ L_AES_GCM_encrypt_avx512_no_ext: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r10d, 11 - vbroadcasti32x4 zmm20, [r15+160] + vbroadcasti32x4 zmm20, OWORD PTR [r15+160] jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+176] + vbroadcasti32x4 zmm20, OWORD PTR [r15+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r10d, 13 - vbroadcasti32x4 zmm20, [r15+192] + vbroadcasti32x4 zmm20, OWORD PTR [r15+192] jl L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+208] + vbroadcasti32x4 zmm20, OWORD PTR [r15+208] vaesenc zmm16, zmm16, 
zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+224] + vbroadcasti32x4 zmm20, OWORD PTR [r15+224] L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -23830,7 +23755,7 @@ L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last: vpxorq zmm19, zmm19, zmm21 vmovdqu64 [rdx+192], zmm19 add ebx, 256 - vbroadcasti32x4 zmm20, [rsp+1024] + vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -23883,30 +23808,30 @@ L_AES_GCM_encrypt_avx512_p1_avx512_ctr16_last: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r10d, 11 - vbroadcasti32x4 zmm20, [r15+160] + vbroadcasti32x4 zmm20, OWORD PTR [r15+160] jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+176] + vbroadcasti32x4 zmm20, OWORD PTR [r15+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r10d, 13 - vbroadcasti32x4 zmm20, [r15+192] + vbroadcasti32x4 zmm20, OWORD PTR [r15+192] jl L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+208] + vbroadcasti32x4 zmm20, OWORD PTR [r15+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+224] + vbroadcasti32x4 zmm20, OWORD PTR [r15+224] L_AES_GCM_encrypt_avx512_p2_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -23935,7 +23860,7 @@ L_AES_GCM_encrypt_avx512_win_loop: mov r12, QWORD PTR [rsp+1056] vpxorq zmm21, zmm21, zmm21 vinserti32x4 zmm21, zmm21, xmm6, 0 - 
vbroadcasti32x4 zmm20, [rsp+1024] + vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -24025,30 +23950,30 @@ L_AES_GCM_encrypt_avx512_win_loop: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r10d, 11 - vbroadcasti32x4 zmm20, [r15+160] + vbroadcasti32x4 zmm20, OWORD PTR [r15+160] jl L_AES_GCM_encrypt_avx512_a_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+176] + vbroadcasti32x4 zmm20, OWORD PTR [r15+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r10d, 13 - vbroadcasti32x4 zmm20, [r15+192] + vbroadcasti32x4 zmm20, OWORD PTR [r15+192] jl L_AES_GCM_encrypt_avx512_a_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+208] + vbroadcasti32x4 zmm20, OWORD PTR [r15+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+224] + vbroadcasti32x4 zmm20, OWORD PTR [r15+224] L_AES_GCM_encrypt_avx512_a_il_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -24069,7 +23994,7 @@ L_AES_GCM_encrypt_avx512_a_il_last: vpxorq zmm19, zmm19, zmm21 vmovdqu64 [rdx+192], zmm19 add ebx, 256 - vbroadcasti32x4 zmm20, [rsp+1024] + vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -24158,30 +24083,30 @@ L_AES_GCM_encrypt_avx512_a_il_last: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r10d, 11 - vbroadcasti32x4 zmm20, [r15+160] + vbroadcasti32x4 zmm20, OWORD PTR [r15+160] jl L_AES_GCM_encrypt_avx512_b_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, 
zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+176] + vbroadcasti32x4 zmm20, OWORD PTR [r15+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r10d, 13 - vbroadcasti32x4 zmm20, [r15+192] + vbroadcasti32x4 zmm20, OWORD PTR [r15+192] jl L_AES_GCM_encrypt_avx512_b_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+208] + vbroadcasti32x4 zmm20, OWORD PTR [r15+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+224] + vbroadcasti32x4 zmm20, OWORD PTR [r15+224] L_AES_GCM_encrypt_avx512_b_il_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -24328,7 +24253,7 @@ L_AES_GCM_encrypt_avx512_no_windows: cmp ebx, r13d jge L_AES_GCM_encrypt_avx512_after_256 ; 256 bytes of input - vbroadcasti32x4 zmm20, [rsp+1024] + vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -24381,30 +24306,30 @@ L_AES_GCM_encrypt_avx512_no_windows: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r10d, 11 - vbroadcasti32x4 zmm20, [r15+160] + vbroadcasti32x4 zmm20, OWORD PTR [r15+160] jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+176] + vbroadcasti32x4 zmm20, OWORD PTR [r15+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r10d, 13 - vbroadcasti32x4 zmm20, [r15+192] + vbroadcasti32x4 zmm20, OWORD PTR [r15+192] jl L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, 
zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+208] + vbroadcasti32x4 zmm20, OWORD PTR [r15+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+224] + vbroadcasti32x4 zmm20, OWORD PTR [r15+224] L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -24429,7 +24354,7 @@ L_AES_GCM_encrypt_avx512_pro_avx512_ctr16_last: cmp ebx, r13d jge L_AES_GCM_encrypt_avx512_last_ghash L_AES_GCM_encrypt_avx512_ghash_128: - vbroadcasti32x4 zmm20, [rsp+1024] + vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -24482,30 +24407,30 @@ L_AES_GCM_encrypt_avx512_ghash_128: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r10d, 11 - vbroadcasti32x4 zmm20, [r15+160] + vbroadcasti32x4 zmm20, OWORD PTR [r15+160] jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+176] + vbroadcasti32x4 zmm20, OWORD PTR [r15+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r10d, 13 - vbroadcasti32x4 zmm20, [r15+192] + vbroadcasti32x4 zmm20, OWORD PTR [r15+192] jl L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+208] + vbroadcasti32x4 zmm20, OWORD PTR [r15+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+224] + vbroadcasti32x4 zmm20, OWORD PTR [r15+224] L_AES_GCM_encrypt_avx512_pip_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ 
-25882,16 +25807,16 @@ L_AES_GCM_decrypt_avx512_no_ext: vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64 vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 - vbroadcasti32x4 zmm9, [r15] - vbroadcasti32x4 zmm10, [r15+16] - vbroadcasti32x4 zmm11, [r15+32] - vbroadcasti32x4 zmm12, [r15+48] - vbroadcasti32x4 zmm13, [r15+64] - vbroadcasti32x4 zmm14, [r15+80] - vbroadcasti32x4 zmm15, [r15+96] - vbroadcasti32x4 zmm1, [r15+112] - vbroadcasti32x4 zmm2, [r15+128] - vbroadcasti32x4 zmm3, [r15+144] + vbroadcasti32x4 zmm9, OWORD PTR [r15] + vbroadcasti32x4 zmm10, OWORD PTR [r15+16] + vbroadcasti32x4 zmm11, OWORD PTR [r15+32] + vbroadcasti32x4 zmm12, OWORD PTR [r15+48] + vbroadcasti32x4 zmm13, OWORD PTR [r15+64] + vbroadcasti32x4 zmm14, OWORD PTR [r15+80] + vbroadcasti32x4 zmm15, OWORD PTR [r15+96] + vbroadcasti32x4 zmm1, OWORD PTR [r15+112] + vbroadcasti32x4 zmm2, OWORD PTR [r15+128] + vbroadcasti32x4 zmm3, OWORD PTR [r15+144] cmp r9d, 512 jl L_AES_GCM_decrypt_avx512_no_windows mov r13d, r9d @@ -26024,7 +25949,7 @@ L_AES_GCM_decrypt_avx512_win_loop: lea rax, QWORD PTR [rdi+rbx] vpxorq zmm21, zmm21, zmm21 vinserti32x4 zmm21, zmm21, xmm6, 0 - vbroadcasti32x4 zmm20, [rsp+1024] + vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -26114,30 +26039,30 @@ L_AES_GCM_decrypt_avx512_win_loop: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r10d, 11 - vbroadcasti32x4 zmm20, [r15+160] + vbroadcasti32x4 zmm20, OWORD PTR [r15+160] jl L_AES_GCM_decrypt_avx512_a_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+176] + vbroadcasti32x4 zmm20, OWORD PTR [r15+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r10d, 13 - vbroadcasti32x4 
zmm20, [r15+192] + vbroadcasti32x4 zmm20, OWORD PTR [r15+192] jl L_AES_GCM_decrypt_avx512_a_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+208] + vbroadcasti32x4 zmm20, OWORD PTR [r15+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+224] + vbroadcasti32x4 zmm20, OWORD PTR [r15+224] L_AES_GCM_decrypt_avx512_a_il_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -26158,7 +26083,7 @@ L_AES_GCM_decrypt_avx512_a_il_last: vpxorq zmm19, zmm19, zmm21 vmovdqu64 [rdx+192], zmm19 add r12d, 256 - vbroadcasti32x4 zmm20, [rsp+1024] + vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -26247,30 +26172,30 @@ L_AES_GCM_decrypt_avx512_a_il_last: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r10d, 11 - vbroadcasti32x4 zmm20, [r15+160] + vbroadcasti32x4 zmm20, OWORD PTR [r15+160] jl L_AES_GCM_decrypt_avx512_b_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+176] + vbroadcasti32x4 zmm20, OWORD PTR [r15+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r10d, 13 - vbroadcasti32x4 zmm20, [r15+192] + vbroadcasti32x4 zmm20, OWORD PTR [r15+192] jl L_AES_GCM_decrypt_avx512_b_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+208] + vbroadcasti32x4 zmm20, OWORD PTR [r15+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+224] + vbroadcasti32x4 zmm20, OWORD PTR [r15+224] 
L_AES_GCM_decrypt_avx512_b_il_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -26307,7 +26232,7 @@ L_AES_GCM_decrypt_avx512_b_il_last: cmp ebx, r13d jl L_AES_GCM_decrypt_avx512_win_loop L_AES_GCM_decrypt_avx512_last_aes: - vbroadcasti32x4 zmm20, [rsp+1024] + vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -26360,30 +26285,30 @@ L_AES_GCM_decrypt_avx512_last_aes: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r10d, 11 - vbroadcasti32x4 zmm20, [r15+160] + vbroadcasti32x4 zmm20, OWORD PTR [r15+160] jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+176] + vbroadcasti32x4 zmm20, OWORD PTR [r15+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r10d, 13 - vbroadcasti32x4 zmm20, [r15+192] + vbroadcasti32x4 zmm20, OWORD PTR [r15+192] jl L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+208] + vbroadcasti32x4 zmm20, OWORD PTR [r15+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+224] + vbroadcasti32x4 zmm20, OWORD PTR [r15+224] L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -26404,7 +26329,7 @@ L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last: vpxorq zmm19, zmm19, zmm21 vmovdqu64 [rdx+192], zmm19 add r12d, 256 - vbroadcasti32x4 zmm20, [rsp+1024] + vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 
@@ -26457,30 +26382,30 @@ L_AES_GCM_decrypt_avx512_l1_avx512_ctr16_last: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r10d, 11 - vbroadcasti32x4 zmm20, [r15+160] + vbroadcasti32x4 zmm20, OWORD PTR [r15+160] jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+176] + vbroadcasti32x4 zmm20, OWORD PTR [r15+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r10d, 13 - vbroadcasti32x4 zmm20, [r15+192] + vbroadcasti32x4 zmm20, OWORD PTR [r15+192] jl L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+208] + vbroadcasti32x4 zmm20, OWORD PTR [r15+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+224] + vbroadcasti32x4 zmm20, OWORD PTR [r15+224] L_AES_GCM_decrypt_avx512_l2_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -26566,7 +26491,7 @@ L_AES_GCM_decrypt_avx512_no_windows: vextracti32x4 xmm5, zmm29, 3 vpxorq xmm6, xmm29, xmm0 vpternlogq xmm6, xmm5, xmm4, 150 - vbroadcasti32x4 zmm20, [rsp+1024] + vbroadcasti32x4 zmm20, OWORD PTR [rsp+1024] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -26619,30 +26544,30 @@ L_AES_GCM_decrypt_avx512_no_windows: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r10d, 11 - vbroadcasti32x4 zmm20, [r15+160] + vbroadcasti32x4 zmm20, OWORD PTR [r15+160] jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+176] + vbroadcasti32x4 zmm20, 
OWORD PTR [r15+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r10d, 13 - vbroadcasti32x4 zmm20, [r15+192] + vbroadcasti32x4 zmm20, OWORD PTR [r15+192] jl L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+208] + vbroadcasti32x4 zmm20, OWORD PTR [r15+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [r15+224] + vbroadcasti32x4 zmm20, OWORD PTR [r15+224] L_AES_GCM_decrypt_avx512_t_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -27943,16 +27868,16 @@ L_AES_GCM_encrypt_update_avx512_no_ext: vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64 vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 - vbroadcasti32x4 zmm9, [rax] - vbroadcasti32x4 zmm10, [rax+16] - vbroadcasti32x4 zmm11, [rax+32] - vbroadcasti32x4 zmm12, [rax+48] - vbroadcasti32x4 zmm13, [rax+64] - vbroadcasti32x4 zmm14, [rax+80] - vbroadcasti32x4 zmm15, [rax+96] - vbroadcasti32x4 zmm1, [rax+112] - vbroadcasti32x4 zmm2, [rax+128] - vbroadcasti32x4 zmm3, [rax+144] + vbroadcasti32x4 zmm9, OWORD PTR [rax] + vbroadcasti32x4 zmm10, OWORD PTR [rax+16] + vbroadcasti32x4 zmm11, OWORD PTR [rax+32] + vbroadcasti32x4 zmm12, OWORD PTR [rax+48] + vbroadcasti32x4 zmm13, OWORD PTR [rax+64] + vbroadcasti32x4 zmm14, OWORD PTR [rax+80] + vbroadcasti32x4 zmm15, OWORD PTR [rax+96] + vbroadcasti32x4 zmm1, OWORD PTR [rax+112] + vbroadcasti32x4 zmm2, OWORD PTR [rax+128] + vbroadcasti32x4 zmm3, OWORD PTR [rax+144] cmp r9d, 512 jl L_AES_GCM_encrypt_update_avx512_no_windows mov ebp, r9d @@ -27983,7 +27908,7 @@ L_AES_GCM_encrypt_update_avx512_no_ext: vmovdqu64 [rsp+960], zmm26 ; 512 bytes of input lea rsi, QWORD PTR [r10+rdi] - vbroadcasti32x4 
zmm20, [r15] + vbroadcasti32x4 zmm20, OWORD PTR [r15] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -28036,30 +27961,30 @@ L_AES_GCM_encrypt_update_avx512_no_ext: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r8d, 11 - vbroadcasti32x4 zmm20, [rax+160] + vbroadcasti32x4 zmm20, OWORD PTR [rax+160] jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+176] + vbroadcasti32x4 zmm20, OWORD PTR [rax+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r8d, 13 - vbroadcasti32x4 zmm20, [rax+192] + vbroadcasti32x4 zmm20, OWORD PTR [rax+192] jl L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+208] + vbroadcasti32x4 zmm20, OWORD PTR [rax+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+224] + vbroadcasti32x4 zmm20, OWORD PTR [rax+224] L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -28080,7 +28005,7 @@ L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last: vpxorq zmm19, zmm19, zmm21 vmovdqu64 [rdx+192], zmm19 add edi, 256 - vbroadcasti32x4 zmm20, [r15] + vbroadcasti32x4 zmm20, OWORD PTR [r15] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -28133,30 +28058,30 @@ L_AES_GCM_encrypt_update_avx512_p1_avx512_ctr16_last: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r8d, 11 - vbroadcasti32x4 zmm20, [rax+160] + vbroadcasti32x4 zmm20, OWORD PTR [rax+160] jl 
L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+176] + vbroadcasti32x4 zmm20, OWORD PTR [rax+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r8d, 13 - vbroadcasti32x4 zmm20, [rax+192] + vbroadcasti32x4 zmm20, OWORD PTR [rax+192] jl L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+208] + vbroadcasti32x4 zmm20, OWORD PTR [rax+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+224] + vbroadcasti32x4 zmm20, OWORD PTR [rax+224] L_AES_GCM_encrypt_update_avx512_p2_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -28183,7 +28108,7 @@ L_AES_GCM_encrypt_update_avx512_win_loop: lea rbx, QWORD PTR [r10+rdi] vpxorq zmm21, zmm21, zmm21 vinserti32x4 zmm21, zmm21, xmm6, 0 - vbroadcasti32x4 zmm20, [r15] + vbroadcasti32x4 zmm20, OWORD PTR [r15] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -28273,30 +28198,30 @@ L_AES_GCM_encrypt_update_avx512_win_loop: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r8d, 11 - vbroadcasti32x4 zmm20, [rax+160] + vbroadcasti32x4 zmm20, OWORD PTR [rax+160] jl L_AES_GCM_encrypt_update_avx512_a_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+176] + vbroadcasti32x4 zmm20, OWORD PTR [rax+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r8d, 13 - vbroadcasti32x4 zmm20, [rax+192] + vbroadcasti32x4 zmm20, 
OWORD PTR [rax+192] jl L_AES_GCM_encrypt_update_avx512_a_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+208] + vbroadcasti32x4 zmm20, OWORD PTR [rax+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+224] + vbroadcasti32x4 zmm20, OWORD PTR [rax+224] L_AES_GCM_encrypt_update_avx512_a_il_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -28317,7 +28242,7 @@ L_AES_GCM_encrypt_update_avx512_a_il_last: vpxorq zmm19, zmm19, zmm21 vmovdqu64 [rdx+192], zmm19 add edi, 256 - vbroadcasti32x4 zmm20, [r15] + vbroadcasti32x4 zmm20, OWORD PTR [r15] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -28406,30 +28331,30 @@ L_AES_GCM_encrypt_update_avx512_a_il_last: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r8d, 11 - vbroadcasti32x4 zmm20, [rax+160] + vbroadcasti32x4 zmm20, OWORD PTR [rax+160] jl L_AES_GCM_encrypt_update_avx512_b_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+176] + vbroadcasti32x4 zmm20, OWORD PTR [rax+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r8d, 13 - vbroadcasti32x4 zmm20, [rax+192] + vbroadcasti32x4 zmm20, OWORD PTR [rax+192] jl L_AES_GCM_encrypt_update_avx512_b_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+208] + vbroadcasti32x4 zmm20, OWORD PTR [rax+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+224] + vbroadcasti32x4 zmm20, OWORD PTR [rax+224] 
L_AES_GCM_encrypt_update_avx512_b_il_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -28574,7 +28499,7 @@ L_AES_GCM_encrypt_update_avx512_no_windows: cmp edi, r13d jge L_AES_GCM_encrypt_update_avx512_after_256 ; 256 bytes of input - vbroadcasti32x4 zmm20, [r15] + vbroadcasti32x4 zmm20, OWORD PTR [r15] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -28627,30 +28552,30 @@ L_AES_GCM_encrypt_update_avx512_no_windows: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r8d, 11 - vbroadcasti32x4 zmm20, [rax+160] + vbroadcasti32x4 zmm20, OWORD PTR [rax+160] jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+176] + vbroadcasti32x4 zmm20, OWORD PTR [rax+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r8d, 13 - vbroadcasti32x4 zmm20, [rax+192] + vbroadcasti32x4 zmm20, OWORD PTR [rax+192] jl L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+208] + vbroadcasti32x4 zmm20, OWORD PTR [rax+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+224] + vbroadcasti32x4 zmm20, OWORD PTR [rax+224] L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -28675,7 +28600,7 @@ L_AES_GCM_encrypt_update_avx512_pro_avx512_ctr16_last: cmp edi, r13d jge L_AES_GCM_encrypt_update_avx512_last_ghash L_AES_GCM_encrypt_update_avx512_ghash_128: - vbroadcasti32x4 zmm20, [r15] + vbroadcasti32x4 zmm20, OWORD PTR [r15] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb 
zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -28728,30 +28653,30 @@ L_AES_GCM_encrypt_update_avx512_ghash_128: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r8d, 11 - vbroadcasti32x4 zmm20, [rax+160] + vbroadcasti32x4 zmm20, OWORD PTR [rax+160] jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+176] + vbroadcasti32x4 zmm20, OWORD PTR [rax+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r8d, 13 - vbroadcasti32x4 zmm20, [rax+192] + vbroadcasti32x4 zmm20, OWORD PTR [rax+192] jl L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+208] + vbroadcasti32x4 zmm20, OWORD PTR [rax+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+224] + vbroadcasti32x4 zmm20, OWORD PTR [rax+224] L_AES_GCM_encrypt_update_avx512_pip_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -29686,16 +29611,16 @@ L_AES_GCM_decrypt_update_avx512_no_ext: vbroadcasti32x4 zmm22, ptr_L_avx512_aes_gcm_bswap_epi64 vbroadcasti32x4 zmm30, ptr_L_avx512_aes_gcm_bswap_mask vbroadcasti32x4 zmm31, ptr_L_avx512_aes_gcm_mod2_128 - vbroadcasti32x4 zmm9, [rax] - vbroadcasti32x4 zmm10, [rax+16] - vbroadcasti32x4 zmm11, [rax+32] - vbroadcasti32x4 zmm12, [rax+48] - vbroadcasti32x4 zmm13, [rax+64] - vbroadcasti32x4 zmm14, [rax+80] - vbroadcasti32x4 zmm15, [rax+96] - vbroadcasti32x4 zmm1, [rax+112] - vbroadcasti32x4 zmm2, [rax+128] - vbroadcasti32x4 zmm3, [rax+144] + vbroadcasti32x4 zmm9, OWORD PTR [rax] + vbroadcasti32x4 zmm10, OWORD PTR [rax+16] + vbroadcasti32x4 zmm11, OWORD PTR [rax+32] + 
vbroadcasti32x4 zmm12, OWORD PTR [rax+48] + vbroadcasti32x4 zmm13, OWORD PTR [rax+64] + vbroadcasti32x4 zmm14, OWORD PTR [rax+80] + vbroadcasti32x4 zmm15, OWORD PTR [rax+96] + vbroadcasti32x4 zmm1, OWORD PTR [rax+112] + vbroadcasti32x4 zmm2, OWORD PTR [rax+128] + vbroadcasti32x4 zmm3, OWORD PTR [rax+144] cmp r9d, 512 jl L_AES_GCM_decrypt_update_avx512_no_windows mov r13d, r9d @@ -29828,7 +29753,7 @@ L_AES_GCM_decrypt_update_avx512_win_loop: lea rbx, QWORD PTR [r11+rdi] vpxorq zmm21, zmm21, zmm21 vinserti32x4 zmm21, zmm21, xmm6, 0 - vbroadcasti32x4 zmm20, [r15] + vbroadcasti32x4 zmm20, OWORD PTR [r15] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -29918,30 +29843,30 @@ L_AES_GCM_decrypt_update_avx512_win_loop: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r8d, 11 - vbroadcasti32x4 zmm20, [rax+160] + vbroadcasti32x4 zmm20, OWORD PTR [rax+160] jl L_AES_GCM_decrypt_update_avx512_a_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+176] + vbroadcasti32x4 zmm20, OWORD PTR [rax+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r8d, 13 - vbroadcasti32x4 zmm20, [rax+192] + vbroadcasti32x4 zmm20, OWORD PTR [rax+192] jl L_AES_GCM_decrypt_update_avx512_a_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+208] + vbroadcasti32x4 zmm20, OWORD PTR [rax+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+224] + vbroadcasti32x4 zmm20, OWORD PTR [rax+224] L_AES_GCM_decrypt_update_avx512_a_il_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -29962,7 +29887,7 @@ L_AES_GCM_decrypt_update_avx512_a_il_last: vpxorq 
zmm19, zmm19, zmm21 vmovdqu64 [rdx+192], zmm19 add esi, 256 - vbroadcasti32x4 zmm20, [r15] + vbroadcasti32x4 zmm20, OWORD PTR [r15] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -30051,30 +29976,30 @@ L_AES_GCM_decrypt_update_avx512_a_il_last: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r8d, 11 - vbroadcasti32x4 zmm20, [rax+160] + vbroadcasti32x4 zmm20, OWORD PTR [rax+160] jl L_AES_GCM_decrypt_update_avx512_b_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+176] + vbroadcasti32x4 zmm20, OWORD PTR [rax+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r8d, 13 - vbroadcasti32x4 zmm20, [rax+192] + vbroadcasti32x4 zmm20, OWORD PTR [rax+192] jl L_AES_GCM_decrypt_update_avx512_b_il_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+208] + vbroadcasti32x4 zmm20, OWORD PTR [rax+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+224] + vbroadcasti32x4 zmm20, OWORD PTR [rax+224] L_AES_GCM_decrypt_update_avx512_b_il_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -30111,7 +30036,7 @@ L_AES_GCM_decrypt_update_avx512_b_il_last: cmp edi, r13d jl L_AES_GCM_decrypt_update_avx512_win_loop L_AES_GCM_decrypt_update_avx512_last_aes: - vbroadcasti32x4 zmm20, [r15] + vbroadcasti32x4 zmm20, OWORD PTR [r15] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -30164,30 +30089,30 @@ L_AES_GCM_decrypt_update_avx512_last_aes: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r8d, 11 - vbroadcasti32x4 zmm20, [rax+160] + 
vbroadcasti32x4 zmm20, OWORD PTR [rax+160] jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+176] + vbroadcasti32x4 zmm20, OWORD PTR [rax+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r8d, 13 - vbroadcasti32x4 zmm20, [rax+192] + vbroadcasti32x4 zmm20, OWORD PTR [rax+192] jl L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+208] + vbroadcasti32x4 zmm20, OWORD PTR [rax+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+224] + vbroadcasti32x4 zmm20, OWORD PTR [rax+224] L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -30208,7 +30133,7 @@ L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last: vpxorq zmm19, zmm19, zmm21 vmovdqu64 [rdx+192], zmm19 add esi, 256 - vbroadcasti32x4 zmm20, [r15] + vbroadcasti32x4 zmm20, OWORD PTR [r15] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -30261,30 +30186,30 @@ L_AES_GCM_decrypt_update_avx512_l1_avx512_ctr16_last: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r8d, 11 - vbroadcasti32x4 zmm20, [rax+160] + vbroadcasti32x4 zmm20, OWORD PTR [rax+160] jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+176] + vbroadcasti32x4 zmm20, OWORD PTR [rax+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r8d, 13 - 
vbroadcasti32x4 zmm20, [rax+192] + vbroadcasti32x4 zmm20, OWORD PTR [rax+192] jl L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+208] + vbroadcasti32x4 zmm20, OWORD PTR [rax+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+224] + vbroadcasti32x4 zmm20, OWORD PTR [rax+224] L_AES_GCM_decrypt_update_avx512_l2_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 @@ -30370,7 +30295,7 @@ L_AES_GCM_decrypt_update_avx512_no_windows: vextracti32x4 xmm5, zmm29, 3 vpxorq xmm6, xmm29, xmm0 vpternlogq xmm6, xmm5, xmm4, 150 - vbroadcasti32x4 zmm20, [r15] + vbroadcasti32x4 zmm20, OWORD PTR [r15] vpaddd zmm16, zmm20, ptr_L_avx512_aes_gcm_inc_z0 vpshufb zmm16, zmm16, zmm22 vpaddd zmm17, zmm20, ptr_L_avx512_aes_gcm_inc_z1 @@ -30423,30 +30348,30 @@ L_AES_GCM_decrypt_update_avx512_no_windows: vaesenc zmm18, zmm18, zmm3 vaesenc zmm19, zmm19, zmm3 cmp r8d, 11 - vbroadcasti32x4 zmm20, [rax+160] + vbroadcasti32x4 zmm20, OWORD PTR [rax+160] jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+176] + vbroadcasti32x4 zmm20, OWORD PTR [rax+176] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 cmp r8d, 13 - vbroadcasti32x4 zmm20, [rax+192] + vbroadcasti32x4 zmm20, OWORD PTR [rax+192] jl L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+208] + vbroadcasti32x4 zmm20, OWORD PTR [rax+208] vaesenc zmm16, zmm16, zmm20 vaesenc zmm17, zmm17, zmm20 vaesenc zmm18, zmm18, zmm20 vaesenc 
zmm19, zmm19, zmm20 - vbroadcasti32x4 zmm20, [rax+224] + vbroadcasti32x4 zmm20, OWORD PTR [rax+224] L_AES_GCM_decrypt_update_avx512_t_avx512_ctr16_last: vaesenclast zmm16, zmm16, zmm20 vaesenclast zmm17, zmm17, zmm20 diff --git a/wolfcrypt/src/aes_gcm_x86_asm.asm b/wolfcrypt/src/aes_gcm_x86_asm.asm new file mode 100644 index 00000000000..e5fe2d87eda --- /dev/null +++ b/wolfcrypt/src/aes_gcm_x86_asm.asm @@ -0,0 +1,12921 @@ +; /* aes_gcm_x86_asm +; * +; * Copyright (C) 2006-2026 wolfSSL Inc. +; * +; * This file is part of wolfSSL. +; * +; * wolfSSL is free software; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation; either version 3 of the License, or +; * (at your option) any later version. +; * +; * wolfSSL is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. 
+; * +; * You should have received a copy of the GNU General Public License +; * along with this program; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA +; */ + +IF @Version LT 1200 +; AVX2 instructions not recognized by old versions of MASM +IFNDEF NO_AVX2_SUPPORT +NO_AVX2_SUPPORT = 1 +ENDIF +; MOVBE instruction not recognized by old versions of MASM +IFNDEF NO_MOVBE_SUPPORT +NO_MOVBE_SUPPORT = 1 +ENDIF +ENDIF + +IFNDEF HAVE_INTEL_AVX1 +HAVE_INTEL_AVX1 = 1 +ENDIF +IFNDEF NO_AVX2_SUPPORT +HAVE_INTEL_AVX2 = 1 +ENDIF + +IFNDEF _WIN32 +_WIN32 = 1 +ENDIF + +.686P +.XMM +.MODEL FLAT, C + +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_one DWORD 00000000h, 00000000h, 00000001h, 00000000h +ptr_L_aes_gcm_one QWORD L_aes_gcm_one +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_two DWORD 00000000h, 00000000h, 00000002h, 00000000h +ptr_L_aes_gcm_two QWORD L_aes_gcm_two +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_three DWORD 00000000h, 00000000h, 00000003h, 00000000h +ptr_L_aes_gcm_three QWORD L_aes_gcm_three +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_four DWORD 00000000h, 00000000h, 00000004h, 00000000h +ptr_L_aes_gcm_four QWORD L_aes_gcm_four +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_bswap_epi64 DWORD 04050607h, 00010203h, 0c0d0e0fh, 08090a0bh +ptr_L_aes_gcm_bswap_epi64 QWORD L_aes_gcm_bswap_epi64 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_bswap_mask DWORD 0c0d0e0fh, 08090a0bh, 04050607h, 00010203h +ptr_L_aes_gcm_bswap_mask QWORD L_aes_gcm_bswap_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_mod2_128 DWORD 00000001h, 00000000h, 00000000h, 0c2000000h +ptr_L_aes_gcm_mod2_128 QWORD L_aes_gcm_mod2_128 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx1_one DWORD 00000000h, 00000000h, 00000001h, 00000000h +ptr_L_aes_gcm_avx1_one QWORD L_aes_gcm_avx1_one +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx1_two DWORD 00000000h, 00000000h, 00000002h, 00000000h +ptr_L_aes_gcm_avx1_two QWORD 
L_aes_gcm_avx1_two +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx1_three DWORD 00000000h, 00000000h, 00000003h, 00000000h +ptr_L_aes_gcm_avx1_three QWORD L_aes_gcm_avx1_three +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx1_four DWORD 00000000h, 00000000h, 00000004h, 00000000h +ptr_L_aes_gcm_avx1_four QWORD L_aes_gcm_avx1_four +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx1_bswap_epi64 DWORD 04050607h, 00010203h, 0c0d0e0fh, 08090a0bh +ptr_L_aes_gcm_avx1_bswap_epi64 QWORD L_aes_gcm_avx1_bswap_epi64 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx1_bswap_mask DWORD 0c0d0e0fh, 08090a0bh, 04050607h, 00010203h +ptr_L_aes_gcm_avx1_bswap_mask QWORD L_aes_gcm_avx1_bswap_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx1_mod2_128 DWORD 00000001h, 00000000h, 00000000h, 0c2000000h +ptr_L_aes_gcm_avx1_mod2_128 QWORD L_aes_gcm_avx1_mod2_128 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx2_one DWORD 00000000h, 00000000h, 00000001h, 00000000h +ptr_L_aes_gcm_avx2_one QWORD L_aes_gcm_avx2_one +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx2_two DWORD 00000000h, 00000000h, 00000002h, 00000000h +ptr_L_aes_gcm_avx2_two QWORD L_aes_gcm_avx2_two +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx2_three DWORD 00000000h, 00000000h, 00000003h, 00000000h +ptr_L_aes_gcm_avx2_three QWORD L_aes_gcm_avx2_three +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx2_four DWORD 00000000h, 00000000h, 00000004h, 00000000h +ptr_L_aes_gcm_avx2_four QWORD L_aes_gcm_avx2_four +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_aes_gcm_bswap_one DWORD 00000000h, 00000000h, 00000000h, 01000000h +ptr_L_avx2_aes_gcm_bswap_one QWORD L_avx2_aes_gcm_bswap_one +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx2_bswap_epi64 DWORD 04050607h, 00010203h, 0c0d0e0fh, 08090a0bh +ptr_L_aes_gcm_avx2_bswap_epi64 QWORD L_aes_gcm_avx2_bswap_epi64 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx2_bswap_mask DWORD 0c0d0e0fh, 08090a0bh, 04050607h, 00010203h +ptr_L_aes_gcm_avx2_bswap_mask QWORD 
L_aes_gcm_avx2_bswap_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_avx2_mod2_128 DWORD 00000001h, 00000000h, 00000000h, 0c2000000h +ptr_L_aes_gcm_avx2_mod2_128 QWORD L_aes_gcm_avx2_mod2_128 +_DATA ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_encrypt_aesni PROC + push ebx + push esi + push edi + push ebp + sub esp, 112 + mov esi, DWORD PTR [esp+144] + mov ebp, DWORD PTR [esp+168] + mov edx, DWORD PTR [esp+160] + pxor xmm0, xmm0 + pxor xmm2, xmm2 + cmp edx, 12 + jne L_AES_GCM_encrypt_aesni_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + pinsrd xmm0, DWORD PTR [esi], 0 + pinsrd xmm0, DWORD PTR [esi+4], 1 + pinsrd xmm0, DWORD PTR [esi+8], 2 + pinsrd xmm0, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + movdqa xmm5, xmm0 + movdqa xmm1, OWORD PTR [ebp] + pxor xmm5, xmm1 + movdqa xmm3, OWORD PTR [ebp+16] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+32] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+48] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+64] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+80] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+96] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+112] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+128] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+144] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + cmp DWORD PTR [esp+172], 11 + movdqa xmm3, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_aesni_calc_iv_12_last + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+176] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + cmp DWORD PTR [esp+172], 13 + movdqa xmm3, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_aesni_calc_iv_12_last + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+208] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+224] 
+L_AES_GCM_encrypt_aesni_calc_iv_12_last: + aesenclast xmm1, xmm3 + aesenclast xmm5, xmm3 + pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask + movdqu OWORD PTR [esp+80], xmm5 + jmp L_AES_GCM_encrypt_aesni_iv_done +L_AES_GCM_encrypt_aesni_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + movdqa xmm1, OWORD PTR [ebp] + aesenc xmm1, [ebp+16] + aesenc xmm1, [ebp+32] + aesenc xmm1, [ebp+48] + aesenc xmm1, [ebp+64] + aesenc xmm1, [ebp+80] + aesenc xmm1, [ebp+96] + aesenc xmm1, [ebp+112] + aesenc xmm1, [ebp+128] + aesenc xmm1, [ebp+144] + cmp DWORD PTR [esp+172], 11 + movdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_aesni_calc_iv_1_aesenc_avx_last + aesenc xmm1, xmm5 + aesenc xmm1, [ebp+176] + cmp DWORD PTR [esp+172], 13 + movdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_aesni_calc_iv_1_aesenc_avx_last + aesenc xmm1, xmm5 + aesenc xmm1, [ebp+208] + movdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_aesni_calc_iv_1_aesenc_avx_last: + aesenclast xmm1, xmm5 + pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov ecx, 0 + je L_AES_GCM_encrypt_aesni_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_encrypt_aesni_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_aesni_calc_iv_16_loop: + movdqu xmm4, OWORD PTR [esi+ecx] + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm4 + pshufd xmm5, xmm0, 78 + pshufd xmm6, xmm1, 78 + movdqa xmm7, xmm1 + movdqa xmm4, xmm1 + pclmulqdq xmm7, xmm0, 17 + pclmulqdq xmm4, xmm0, 0 + pxor xmm5, xmm0 + pxor xmm6, xmm1 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm4 + movdqa xmm0, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm3, xmm6 + pxor xmm0, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm0 + psrld xmm4, 31 + psrld xmm5, 31 + pslld xmm3, 1 + pslld xmm0, 1 + movdqa xmm6, xmm4 + pslldq xmm4, 4 + psrldq xmm6, 12 + pslldq xmm5, 4 + por xmm0, xmm6 + por xmm3, xmm4 + por xmm0, xmm5 + movdqa xmm4, xmm3 + 
movdqa xmm5, xmm3 + movdqa xmm6, xmm3 + pslld xmm4, 31 + pslld xmm5, 30 + pslld xmm6, 25 + pxor xmm4, xmm5 + pxor xmm4, xmm6 + movdqa xmm5, xmm4 + psrldq xmm5, 4 + pslldq xmm4, 12 + pxor xmm3, xmm4 + movdqa xmm6, xmm3 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + psrld xmm6, 1 + psrld xmm7, 2 + psrld xmm4, 7 + pxor xmm6, xmm7 + pxor xmm6, xmm4 + pxor xmm6, xmm5 + pxor xmm6, xmm3 + pxor xmm0, xmm6 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_aesni_calc_iv_16_loop + mov edx, DWORD PTR [esp+160] + cmp ecx, edx + je L_AES_GCM_encrypt_aesni_calc_iv_done +L_AES_GCM_encrypt_aesni_calc_iv_lt16: + sub esp, 16 + pxor xmm4, xmm4 + xor ebx, ebx + movdqu OWORD PTR [esp], xmm4 +L_AES_GCM_encrypt_aesni_calc_iv_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_aesni_calc_iv_loop + movdqu xmm4, OWORD PTR [esp] + add esp, 16 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm4 + pshufd xmm5, xmm0, 78 + pshufd xmm6, xmm1, 78 + movdqa xmm7, xmm1 + movdqa xmm4, xmm1 + pclmulqdq xmm7, xmm0, 17 + pclmulqdq xmm4, xmm0, 0 + pxor xmm5, xmm0 + pxor xmm6, xmm1 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm4 + movdqa xmm0, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm3, xmm6 + pxor xmm0, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm0 + psrld xmm4, 31 + psrld xmm5, 31 + pslld xmm3, 1 + pslld xmm0, 1 + movdqa xmm6, xmm4 + pslldq xmm4, 4 + psrldq xmm6, 12 + pslldq xmm5, 4 + por xmm0, xmm6 + por xmm3, xmm4 + por xmm0, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm3 + movdqa xmm6, xmm3 + pslld xmm4, 31 + pslld xmm5, 30 + pslld xmm6, 25 + pxor xmm4, xmm5 + pxor xmm4, xmm6 + movdqa xmm5, xmm4 + psrldq xmm5, 4 + pslldq xmm4, 12 + pxor xmm3, xmm4 + movdqa xmm6, xmm3 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + psrld xmm6, 1 + psrld xmm7, 2 + psrld xmm4, 7 + pxor xmm6, xmm7 + pxor xmm6, xmm4 + pxor xmm6, xmm5 + pxor xmm6, xmm3 + pxor xmm0, xmm6 
+L_AES_GCM_encrypt_aesni_calc_iv_done: + ; T = Encrypt counter + pxor xmm4, xmm4 + shl edx, 3 + pinsrd xmm4, edx, 0 + pxor xmm0, xmm4 + pshufd xmm5, xmm0, 78 + pshufd xmm6, xmm1, 78 + movdqa xmm7, xmm1 + movdqa xmm4, xmm1 + pclmulqdq xmm7, xmm0, 17 + pclmulqdq xmm4, xmm0, 0 + pxor xmm5, xmm0 + pxor xmm6, xmm1 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm4 + movdqa xmm0, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm3, xmm6 + pxor xmm0, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm0 + psrld xmm4, 31 + psrld xmm5, 31 + pslld xmm3, 1 + pslld xmm0, 1 + movdqa xmm6, xmm4 + pslldq xmm4, 4 + psrldq xmm6, 12 + pslldq xmm5, 4 + por xmm0, xmm6 + por xmm3, xmm4 + por xmm0, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm3 + movdqa xmm6, xmm3 + pslld xmm4, 31 + pslld xmm5, 30 + pslld xmm6, 25 + pxor xmm4, xmm5 + pxor xmm4, xmm6 + movdqa xmm5, xmm4 + psrldq xmm5, 4 + pslldq xmm4, 12 + pxor xmm3, xmm4 + movdqa xmm6, xmm3 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + psrld xmm6, 1 + psrld xmm7, 2 + psrld xmm4, 7 + pxor xmm6, xmm7 + pxor xmm6, xmm4 + pxor xmm6, xmm5 + pxor xmm6, xmm3 + pxor xmm0, xmm6 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + ; Encrypt counter + movdqa xmm4, OWORD PTR [ebp] + pxor xmm4, xmm0 + aesenc xmm4, [ebp+16] + aesenc xmm4, [ebp+32] + aesenc xmm4, [ebp+48] + aesenc xmm4, [ebp+64] + aesenc xmm4, [ebp+80] + aesenc xmm4, [ebp+96] + aesenc xmm4, [ebp+112] + aesenc xmm4, [ebp+128] + aesenc xmm4, [ebp+144] + cmp DWORD PTR [esp+172], 11 + movdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_aesni_calc_iv_2_aesenc_avx_last + aesenc xmm4, xmm5 + aesenc xmm4, [ebp+176] + cmp DWORD PTR [esp+172], 13 + movdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_aesni_calc_iv_2_aesenc_avx_last + aesenc xmm4, xmm5 + aesenc xmm4, [ebp+208] + movdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_aesni_calc_iv_2_aesenc_avx_last: + aesenclast xmm4, xmm5 + movdqu OWORD PTR [esp+80], xmm4 +L_AES_GCM_encrypt_aesni_iv_done: + mov 
esi, DWORD PTR [esp+140] + ; Additional authentication data + mov edx, DWORD PTR [esp+156] + cmp edx, 0 + je L_AES_GCM_encrypt_aesni_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_encrypt_aesni_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_aesni_calc_aad_16_loop: + movdqu xmm4, OWORD PTR [esi+ecx] + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm2, xmm4 + pshufd xmm5, xmm2, 78 + pshufd xmm6, xmm1, 78 + movdqa xmm7, xmm1 + movdqa xmm4, xmm1 + pclmulqdq xmm7, xmm2, 17 + pclmulqdq xmm4, xmm2, 0 + pxor xmm5, xmm2 + pxor xmm6, xmm1 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm4 + movdqa xmm2, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm3, xmm6 + pxor xmm2, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm2 + psrld xmm4, 31 + psrld xmm5, 31 + pslld xmm3, 1 + pslld xmm2, 1 + movdqa xmm6, xmm4 + pslldq xmm4, 4 + psrldq xmm6, 12 + pslldq xmm5, 4 + por xmm2, xmm6 + por xmm3, xmm4 + por xmm2, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm3 + movdqa xmm6, xmm3 + pslld xmm4, 31 + pslld xmm5, 30 + pslld xmm6, 25 + pxor xmm4, xmm5 + pxor xmm4, xmm6 + movdqa xmm5, xmm4 + psrldq xmm5, 4 + pslldq xmm4, 12 + pxor xmm3, xmm4 + movdqa xmm6, xmm3 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + psrld xmm6, 1 + psrld xmm7, 2 + psrld xmm4, 7 + pxor xmm6, xmm7 + pxor xmm6, xmm4 + pxor xmm6, xmm5 + pxor xmm6, xmm3 + pxor xmm2, xmm6 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_aesni_calc_aad_16_loop + mov edx, DWORD PTR [esp+156] + cmp ecx, edx + je L_AES_GCM_encrypt_aesni_calc_aad_done +L_AES_GCM_encrypt_aesni_calc_aad_lt16: + sub esp, 16 + pxor xmm4, xmm4 + xor ebx, ebx + movdqu OWORD PTR [esp], xmm4 +L_AES_GCM_encrypt_aesni_calc_aad_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_aesni_calc_aad_loop + movdqu xmm4, OWORD PTR [esp] + add esp, 16 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm2, xmm4 + pshufd xmm5, xmm2, 78 + 
pshufd xmm6, xmm1, 78 + movdqa xmm7, xmm1 + movdqa xmm4, xmm1 + pclmulqdq xmm7, xmm2, 17 + pclmulqdq xmm4, xmm2, 0 + pxor xmm5, xmm2 + pxor xmm6, xmm1 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm4 + movdqa xmm2, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm3, xmm6 + pxor xmm2, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm2 + psrld xmm4, 31 + psrld xmm5, 31 + pslld xmm3, 1 + pslld xmm2, 1 + movdqa xmm6, xmm4 + pslldq xmm4, 4 + psrldq xmm6, 12 + pslldq xmm5, 4 + por xmm2, xmm6 + por xmm3, xmm4 + por xmm2, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm3 + movdqa xmm6, xmm3 + pslld xmm4, 31 + pslld xmm5, 30 + pslld xmm6, 25 + pxor xmm4, xmm5 + pxor xmm4, xmm6 + movdqa xmm5, xmm4 + psrldq xmm5, 4 + pslldq xmm4, 12 + pxor xmm3, xmm4 + movdqa xmm6, xmm3 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + psrld xmm6, 1 + psrld xmm7, 2 + psrld xmm4, 7 + pxor xmm6, xmm7 + pxor xmm6, xmm4 + pxor xmm6, xmm5 + pxor xmm6, xmm3 + pxor xmm2, xmm6 +L_AES_GCM_encrypt_aesni_calc_aad_done: + movdqu OWORD PTR [esp+96], xmm2 + mov esi, DWORD PTR [esp+132] + mov edi, DWORD PTR [esp+136] + ; Calculate counter and H + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm5, xmm1 + paddd xmm0, OWORD PTR L_aes_gcm_one + movdqa xmm4, xmm1 + movdqu OWORD PTR [esp+64], xmm0 + psrlq xmm5, 63 + psllq xmm4, 1 + pslldq xmm5, 8 + por xmm4, xmm5 + pshufd xmm1, xmm1, 255 + psrad xmm1, 31 + pand xmm1, OWORD PTR L_aes_gcm_mod2_128 + pxor xmm1, xmm4 + xor ebx, ebx + mov eax, DWORD PTR [esp+152] + cmp eax, 64 + jl L_AES_GCM_encrypt_aesni_done_64 + and eax, 4294967232 + movdqa xmm6, xmm2 + ; H ^ 1 + movdqu OWORD PTR [esp], xmm1 + ; H ^ 2 + pshufd xmm5, xmm1, 78 + pshufd xmm6, xmm1, 78 + movdqa xmm7, xmm1 + movdqa xmm4, xmm1 + pclmulqdq xmm7, xmm1, 17 + pclmulqdq xmm4, xmm1, 0 + pxor xmm5, xmm1 + pxor xmm6, xmm1 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm0, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm4, 
xmm6 + pxor xmm0, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pslld xmm5, 31 + pslld xmm6, 30 + pslld xmm7, 25 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqa xmm7, xmm5 + psrldq xmm7, 4 + pslldq xmm5, 12 + pxor xmm4, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + psrld xmm5, 1 + psrld xmm6, 2 + pxor xmm5, xmm6 + pxor xmm5, xmm4 + psrld xmm4, 7 + pxor xmm5, xmm7 + pxor xmm5, xmm4 + pxor xmm0, xmm5 + movdqu OWORD PTR [esp+16], xmm0 + ; H ^ 3 + pshufd xmm5, xmm1, 78 + pshufd xmm6, xmm0, 78 + movdqa xmm7, xmm0 + movdqa xmm4, xmm0 + pclmulqdq xmm7, xmm1, 17 + pclmulqdq xmm4, xmm1, 0 + pxor xmm5, xmm1 + pxor xmm6, xmm0 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm4, xmm6 + pxor xmm3, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pslld xmm5, 31 + pslld xmm6, 30 + pslld xmm7, 25 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqa xmm7, xmm5 + psrldq xmm7, 4 + pslldq xmm5, 12 + pxor xmm4, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + psrld xmm5, 1 + psrld xmm6, 2 + pxor xmm5, xmm6 + pxor xmm5, xmm4 + psrld xmm4, 7 + pxor xmm5, xmm7 + pxor xmm5, xmm4 + pxor xmm3, xmm5 + movdqu OWORD PTR [esp+32], xmm3 + ; H ^ 4 + pshufd xmm5, xmm0, 78 + pshufd xmm6, xmm0, 78 + movdqa xmm7, xmm0 + movdqa xmm4, xmm0 + pclmulqdq xmm7, xmm0, 17 + pclmulqdq xmm4, xmm0, 0 + pxor xmm5, xmm0 + pxor xmm6, xmm0 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm4, xmm6 + pxor xmm3, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pslld xmm5, 31 + pslld xmm6, 30 + pslld xmm7, 25 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqa xmm7, xmm5 + psrldq xmm7, 4 + pslldq xmm5, 12 + pxor xmm4, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + psrld xmm5, 1 + psrld xmm6, 2 + pxor xmm5, xmm6 + pxor xmm5, xmm4 + psrld xmm4, 7 + pxor xmm5, xmm7 + pxor xmm5, xmm4 + pxor xmm3, 
xmm5 + movdqu OWORD PTR [esp+48], xmm3 + ; First 64 bytes of input + ; Encrypt 64 bytes of counter + movdqu xmm4, OWORD PTR [esp+64] + movdqu xmm3, xmm4 + paddd xmm3, OWORD PTR L_aes_gcm_four + movdqu OWORD PTR [esp+64], xmm3 + movdqa xmm3, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pshufb xmm4, xmm3 + paddd xmm5, OWORD PTR L_aes_gcm_one + pshufb xmm5, xmm3 + paddd xmm6, OWORD PTR L_aes_gcm_two + pshufb xmm6, xmm3 + paddd xmm7, OWORD PTR L_aes_gcm_three + pshufb xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp] + pxor xmm4, xmm3 + pxor xmm5, xmm3 + pxor xmm6, xmm3 + pxor xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+16] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+32] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+48] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+64] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+80] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+96] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+112] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+128] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+144] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + cmp DWORD PTR [esp+172], 11 + movdqa xmm3, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_aesni_enc_done + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+176] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + cmp DWORD PTR [esp+172], 13 + movdqa 
xmm3, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_aesni_enc_done + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+208] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_aesni_enc_done: + aesenclast xmm4, xmm3 + aesenclast xmm5, xmm3 + movdqu xmm0, OWORD PTR [esi] + movdqu xmm1, OWORD PTR [esi+16] + pxor xmm4, xmm0 + pxor xmm5, xmm1 + movdqu OWORD PTR [edi], xmm4 + movdqu OWORD PTR [edi+16], xmm5 + aesenclast xmm6, xmm3 + aesenclast xmm7, xmm3 + movdqu xmm0, OWORD PTR [esi+32] + movdqu xmm1, OWORD PTR [esi+48] + pxor xmm6, xmm0 + pxor xmm7, xmm1 + movdqu OWORD PTR [edi+32], xmm6 + movdqu OWORD PTR [edi+48], xmm7 + cmp eax, 64 + mov ebx, 64 + mov ecx, esi + mov edx, edi + jle L_AES_GCM_encrypt_aesni_end_64 + ; More 64 bytes of input +L_AES_GCM_encrypt_aesni_ghash_64: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + ; Encrypt 64 bytes of counter + movdqu xmm4, OWORD PTR [esp+64] + movdqu xmm3, xmm4 + paddd xmm3, OWORD PTR L_aes_gcm_four + movdqu OWORD PTR [esp+64], xmm3 + movdqa xmm3, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pshufb xmm4, xmm3 + paddd xmm5, OWORD PTR L_aes_gcm_one + pshufb xmm5, xmm3 + paddd xmm6, OWORD PTR L_aes_gcm_two + pshufb xmm6, xmm3 + paddd xmm7, OWORD PTR L_aes_gcm_three + pshufb xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp] + pxor xmm4, xmm3 + pxor xmm5, xmm3 + pxor xmm6, xmm3 + pxor xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+16] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+32] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+48] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+64] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, 
xmm3 + movdqa xmm3, OWORD PTR [ebp+80] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+96] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+112] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+128] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+144] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + cmp DWORD PTR [esp+172], 11 + movdqa xmm3, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_aesni_aesenc_64_ghash_avx_done + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+176] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + cmp DWORD PTR [esp+172], 13 + movdqa xmm3, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_aesni_aesenc_64_ghash_avx_done + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+208] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_aesni_aesenc_64_ghash_avx_done: + aesenclast xmm4, xmm3 + aesenclast xmm5, xmm3 + movdqu xmm0, OWORD PTR [ecx] + movdqu xmm1, OWORD PTR [ecx+16] + pxor xmm4, xmm0 + pxor xmm5, xmm1 + movdqu OWORD PTR [edx], xmm4 + movdqu OWORD PTR [edx+16], xmm5 + aesenclast xmm6, xmm3 + aesenclast xmm7, xmm3 + movdqu xmm0, OWORD PTR [ecx+32] + movdqu xmm1, OWORD PTR [ecx+48] + pxor xmm6, xmm0 + pxor xmm7, xmm1 + movdqu OWORD PTR [edx+32], xmm6 + movdqu OWORD PTR [edx+48], xmm7 + ; ghash encrypted counter + movdqu xmm6, OWORD PTR [esp+96] + movdqu xmm3, OWORD PTR [esp+48] + movdqu xmm4, OWORD PTR [edx+-64] + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm6 + pshufd xmm5, xmm3, 78 + pshufd xmm1, xmm4, 78 + pxor xmm5, xmm3 + pxor xmm1, 
xmm4 + movdqa xmm7, xmm4 + pclmulqdq xmm7, xmm3, 17 + movdqa xmm6, xmm4 + pclmulqdq xmm6, xmm3, 0 + pclmulqdq xmm5, xmm1, 0 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqu xmm3, OWORD PTR [esp+32] + movdqu xmm4, OWORD PTR [edx+-48] + pshufd xmm0, xmm3, 78 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm3 + pshufd xmm1, xmm4, 78 + pxor xmm1, xmm4 + movdqa xmm2, xmm4 + pclmulqdq xmm2, xmm3, 17 + pclmulqdq xmm3, xmm4, 0 + pclmulqdq xmm0, xmm1, 0 + pxor xmm5, xmm3 + pxor xmm6, xmm3 + pxor xmm5, xmm2 + pxor xmm7, xmm2 + pxor xmm5, xmm0 + movdqu xmm3, OWORD PTR [esp+16] + movdqu xmm4, OWORD PTR [edx+-32] + pshufd xmm0, xmm3, 78 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm3 + pshufd xmm1, xmm4, 78 + pxor xmm1, xmm4 + movdqa xmm2, xmm4 + pclmulqdq xmm2, xmm3, 17 + pclmulqdq xmm3, xmm4, 0 + pclmulqdq xmm0, xmm1, 0 + pxor xmm5, xmm3 + pxor xmm6, xmm3 + pxor xmm5, xmm2 + pxor xmm7, xmm2 + pxor xmm5, xmm0 + movdqu xmm3, OWORD PTR [esp] + movdqu xmm4, OWORD PTR [edx+-16] + pshufd xmm0, xmm3, 78 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm3 + pshufd xmm1, xmm4, 78 + pxor xmm1, xmm4 + movdqa xmm2, xmm4 + pclmulqdq xmm2, xmm3, 17 + pclmulqdq xmm3, xmm4, 0 + pclmulqdq xmm0, xmm1, 0 + pxor xmm5, xmm3 + pxor xmm6, xmm3 + pxor xmm5, xmm2 + pxor xmm7, xmm2 + pxor xmm5, xmm0 + movdqa xmm1, xmm5 + psrldq xmm5, 8 + pslldq xmm1, 8 + pxor xmm6, xmm1 + pxor xmm7, xmm5 + movdqa xmm3, xmm6 + movdqa xmm0, xmm6 + movdqa xmm1, xmm6 + pslld xmm3, 31 + pslld xmm0, 30 + pslld xmm1, 25 + pxor xmm3, xmm0 + pxor xmm3, xmm1 + movdqa xmm0, xmm3 + pslldq xmm3, 12 + psrldq xmm0, 4 + pxor xmm6, xmm3 + movdqa xmm1, xmm6 + movdqa xmm5, xmm6 + movdqa xmm4, xmm6 + psrld xmm1, 1 + psrld xmm5, 2 + psrld xmm4, 7 + pxor xmm1, xmm5 + pxor xmm1, xmm4 + pxor xmm1, xmm0 + pxor xmm6, xmm1 + pxor xmm6, xmm7 + movdqu OWORD PTR [esp+96], xmm6 + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_encrypt_aesni_ghash_64 +L_AES_GCM_encrypt_aesni_end_64: + movdqu xmm2, OWORD PTR [esp+96] + 
; Block 1 + movdqa xmm4, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm1, OWORD PTR [edx] + pshufb xmm1, xmm4 + movdqu xmm3, OWORD PTR [esp+48] + pxor xmm1, xmm2 + pshufd xmm5, xmm1, 78 + pshufd xmm6, xmm3, 78 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + pclmulqdq xmm7, xmm1, 17 + pclmulqdq xmm4, xmm1, 0 + pxor xmm5, xmm1 + pxor xmm6, xmm3 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm0, xmm4 + movdqa xmm2, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm0, xmm6 + pxor xmm2, xmm5 + ; Block 2 + movdqa xmm4, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm1, OWORD PTR [edx+16] + pshufb xmm1, xmm4 + movdqu xmm3, OWORD PTR [esp+32] + pshufd xmm5, xmm1, 78 + pshufd xmm6, xmm3, 78 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + pclmulqdq xmm7, xmm1, 17 + pclmulqdq xmm4, xmm1, 0 + pxor xmm5, xmm1 + pxor xmm6, xmm3 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + pxor xmm0, xmm4 + pxor xmm2, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm0, xmm6 + pxor xmm2, xmm5 + ; Block 3 + movdqa xmm4, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm1, OWORD PTR [edx+32] + pshufb xmm1, xmm4 + movdqu xmm3, OWORD PTR [esp+16] + pshufd xmm5, xmm1, 78 + pshufd xmm6, xmm3, 78 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + pclmulqdq xmm7, xmm1, 17 + pclmulqdq xmm4, xmm1, 0 + pxor xmm5, xmm1 + pxor xmm6, xmm3 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + pxor xmm0, xmm4 + pxor xmm2, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm0, xmm6 + pxor xmm2, xmm5 + ; Block 4 + movdqa xmm4, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm1, OWORD PTR [edx+48] + pshufb xmm1, xmm4 + movdqu xmm3, OWORD PTR [esp] + pshufd xmm5, xmm1, 78 + pshufd xmm6, xmm3, 78 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + pclmulqdq xmm7, xmm1, 17 + pclmulqdq xmm4, xmm1, 0 + pxor xmm5, xmm1 + pxor xmm6, xmm3 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + pxor xmm0, xmm4 + pxor xmm2, xmm7 + pslldq 
xmm6, 8 + psrldq xmm5, 8 + pxor xmm0, xmm6 + pxor xmm2, xmm5 + movdqa xmm4, xmm0 + movdqa xmm5, xmm0 + movdqa xmm6, xmm0 + pslld xmm4, 31 + pslld xmm5, 30 + pslld xmm6, 25 + pxor xmm4, xmm5 + pxor xmm4, xmm6 + movdqa xmm5, xmm4 + psrldq xmm5, 4 + pslldq xmm4, 12 + pxor xmm0, xmm4 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm4, xmm0 + psrld xmm6, 1 + psrld xmm7, 2 + psrld xmm4, 7 + pxor xmm6, xmm7 + pxor xmm6, xmm4 + pxor xmm6, xmm5 + pxor xmm6, xmm0 + pxor xmm2, xmm6 + movdqu xmm1, OWORD PTR [esp] +L_AES_GCM_encrypt_aesni_done_64: + mov edx, DWORD PTR [esp+152] + cmp ebx, edx + jge L_AES_GCM_encrypt_aesni_done_enc + mov eax, DWORD PTR [esp+152] + and eax, 4294967280 + cmp ebx, eax + jge L_AES_GCM_encrypt_aesni_last_block_done + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + movdqu xmm4, OWORD PTR [esp+64] + movdqa xmm5, xmm4 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm5, OWORD PTR L_aes_gcm_one + pxor xmm4, [ebp] + movdqu OWORD PTR [esp+64], xmm5 + aesenc xmm4, [ebp+16] + aesenc xmm4, [ebp+32] + aesenc xmm4, [ebp+48] + aesenc xmm4, [ebp+64] + aesenc xmm4, [ebp+80] + aesenc xmm4, [ebp+96] + aesenc xmm4, [ebp+112] + aesenc xmm4, [ebp+128] + aesenc xmm4, [ebp+144] + cmp DWORD PTR [esp+172], 11 + movdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_aesni_aesenc_block_aesenc_avx_last + aesenc xmm4, xmm5 + aesenc xmm4, [ebp+176] + cmp DWORD PTR [esp+172], 13 + movdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_aesni_aesenc_block_aesenc_avx_last + aesenc xmm4, xmm5 + aesenc xmm4, [ebp+208] + movdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_aesni_aesenc_block_aesenc_avx_last: + aesenclast xmm4, xmm5 + movdqu xmm5, OWORD PTR [ecx] + pxor xmm4, xmm5 + movdqu OWORD PTR [edx], xmm4 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm2, xmm4 + add ebx, 16 + cmp ebx, eax + jge L_AES_GCM_encrypt_aesni_last_block_ghash +L_AES_GCM_encrypt_aesni_last_block_start: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + movdqu 
xmm4, OWORD PTR [esp+64] + movdqa xmm5, xmm4 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm5, OWORD PTR L_aes_gcm_one + pxor xmm4, [ebp] + movdqu OWORD PTR [esp+64], xmm5 + movdqu xmm0, xmm2 + pclmulqdq xmm0, xmm1, 16 + aesenc xmm4, [ebp+16] + aesenc xmm4, [ebp+32] + movdqu xmm3, xmm2 + pclmulqdq xmm3, xmm1, 1 + aesenc xmm4, [ebp+48] + aesenc xmm4, [ebp+64] + aesenc xmm4, [ebp+80] + movdqu xmm5, xmm2 + pclmulqdq xmm5, xmm1, 17 + aesenc xmm4, [ebp+96] + pxor xmm0, xmm3 + movdqa xmm6, xmm0 + psrldq xmm0, 8 + pslldq xmm6, 8 + aesenc xmm4, [ebp+112] + movdqu xmm3, xmm2 + pclmulqdq xmm3, xmm1, 0 + pxor xmm6, xmm3 + pxor xmm5, xmm0 + movdqa xmm7, OWORD PTR L_aes_gcm_mod2_128 + movdqa xmm3, xmm6 + pclmulqdq xmm3, xmm7, 16 + aesenc xmm4, [ebp+128] + pshufd xmm0, xmm6, 78 + pxor xmm0, xmm3 + movdqa xmm3, xmm0 + pclmulqdq xmm3, xmm7, 16 + aesenc xmm4, [ebp+144] + pshufd xmm2, xmm0, 78 + pxor xmm2, xmm3 + pxor xmm2, xmm5 + cmp DWORD PTR [esp+172], 11 + movdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_aesni_aesenc_gfmul_last + aesenc xmm4, xmm5 + aesenc xmm4, [ebp+176] + cmp DWORD PTR [esp+172], 13 + movdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_aesni_aesenc_gfmul_last + aesenc xmm4, xmm5 + aesenc xmm4, [ebp+208] + movdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_aesni_aesenc_gfmul_last: + aesenclast xmm4, xmm5 + movdqu xmm5, OWORD PTR [ecx] + pxor xmm4, xmm5 + movdqu OWORD PTR [edx], xmm4 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm2, xmm4 + add ebx, 16 + cmp ebx, eax + jl L_AES_GCM_encrypt_aesni_last_block_start +L_AES_GCM_encrypt_aesni_last_block_ghash: + pshufd xmm5, xmm1, 78 + pshufd xmm6, xmm2, 78 + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + pclmulqdq xmm7, xmm1, 17 + pclmulqdq xmm4, xmm1, 0 + pxor xmm5, xmm1 + pxor xmm6, xmm2 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm2, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm4, xmm6 + pxor xmm2, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, 
xmm4 + movdqa xmm7, xmm4 + pslld xmm5, 31 + pslld xmm6, 30 + pslld xmm7, 25 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqa xmm7, xmm5 + psrldq xmm7, 4 + pslldq xmm5, 12 + pxor xmm4, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + psrld xmm5, 1 + psrld xmm6, 2 + pxor xmm5, xmm6 + pxor xmm5, xmm4 + psrld xmm4, 7 + pxor xmm5, xmm7 + pxor xmm5, xmm4 + pxor xmm2, xmm5 +L_AES_GCM_encrypt_aesni_last_block_done: + mov ecx, DWORD PTR [esp+152] + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_done + movdqu xmm0, OWORD PTR [esp+64] + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64 + pxor xmm0, [ebp] + aesenc xmm0, [ebp+16] + aesenc xmm0, [ebp+32] + aesenc xmm0, [ebp+48] + aesenc xmm0, [ebp+64] + aesenc xmm0, [ebp+80] + aesenc xmm0, [ebp+96] + aesenc xmm0, [ebp+112] + aesenc xmm0, [ebp+128] + aesenc xmm0, [ebp+144] + cmp DWORD PTR [esp+172], 11 + movdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_aesenc_avx_last + aesenc xmm0, xmm5 + aesenc xmm0, [ebp+176] + cmp DWORD PTR [esp+172], 13 + movdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_aesenc_avx_last + aesenc xmm0, xmm5 + aesenc xmm0, [ebp+208] + movdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_aesenc_avx_last: + aesenclast xmm0, xmm5 + sub esp, 16 + xor ecx, ecx + movdqu OWORD PTR [esp], xmm0 +L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_loop: + movzx eax, BYTE PTR [esi+ebx] + xor al, BYTE PTR [esp+ecx] + mov BYTE PTR [edi+ebx], al + mov BYTE PTR [esp+ecx], al + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_loop + xor eax, eax + cmp ecx, 16 + je L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_finish_enc +L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_byte_loop: + mov BYTE PTR [esp+ecx], al + inc ecx + cmp ecx, 16 + jl L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_finish_enc: + movdqu xmm0, OWORD PTR [esp] + add 
esp, 16 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm2, xmm0 + pshufd xmm5, xmm1, 78 + pshufd xmm6, xmm2, 78 + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + pclmulqdq xmm7, xmm1, 17 + pclmulqdq xmm4, xmm1, 0 + pxor xmm5, xmm1 + pxor xmm6, xmm2 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm2, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm4, xmm6 + pxor xmm2, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pslld xmm5, 31 + pslld xmm6, 30 + pslld xmm7, 25 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqa xmm7, xmm5 + psrldq xmm7, 4 + pslldq xmm5, 12 + pxor xmm4, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + psrld xmm5, 1 + psrld xmm6, 2 + pxor xmm5, xmm6 + pxor xmm5, xmm4 + psrld xmm4, 7 + pxor xmm5, xmm7 + pxor xmm5, xmm4 + pxor xmm2, xmm5 +L_AES_GCM_encrypt_aesni_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_aesni_done_enc: + mov edi, DWORD PTR [esp+148] + mov ebx, DWORD PTR [esp+164] + mov edx, DWORD PTR [esp+152] + mov ecx, DWORD PTR [esp+156] + shl edx, 3 + shl ecx, 3 + pinsrd xmm4, edx, 0 + pinsrd xmm4, ecx, 2 + mov edx, DWORD PTR [esp+152] + mov ecx, DWORD PTR [esp+156] + shr edx, 29 + shr ecx, 29 + pinsrd xmm4, edx, 1 + pinsrd xmm4, ecx, 3 + pxor xmm2, xmm4 + pshufd xmm5, xmm1, 78 + pshufd xmm6, xmm2, 78 + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + pclmulqdq xmm7, xmm1, 17 + pclmulqdq xmm4, xmm1, 0 + pxor xmm5, xmm1 + pxor xmm6, xmm2 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm2, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm4, xmm6 + pxor xmm2, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pslld xmm5, 31 + pslld xmm6, 30 + pslld xmm7, 25 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqa xmm7, xmm5 + psrldq xmm7, 4 + pslldq xmm5, 12 + pxor xmm4, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + psrld xmm5, 1 + psrld xmm6, 2 + pxor xmm5, xmm6 + pxor xmm5, xmm4 + psrld xmm4, 7 + pxor xmm5, xmm7 + pxor xmm5, xmm4 + pxor xmm2, xmm5 
+ pshufb xmm2, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm4, OWORD PTR [esp+80] + pxor xmm4, xmm2 + cmp ebx, 16 + je L_AES_GCM_encrypt_aesni_store_tag_16 + xor ecx, ecx + movdqu OWORD PTR [esp], xmm4 +L_AES_GCM_encrypt_aesni_store_tag_loop: + movzx eax, BYTE PTR [esp+ecx] + mov BYTE PTR [edi+ecx], al + inc ecx + cmp ecx, ebx + jne L_AES_GCM_encrypt_aesni_store_tag_loop + jmp L_AES_GCM_encrypt_aesni_store_tag_done +L_AES_GCM_encrypt_aesni_store_tag_16: + movdqu OWORD PTR [edi], xmm4 +L_AES_GCM_encrypt_aesni_store_tag_done: + add esp, 112 + pop ebp + pop edi + pop esi + pop ebx + ret +AES_GCM_encrypt_aesni ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_decrypt_aesni PROC + push ebx + push esi + push edi + push ebp + sub esp, 176 + mov esi, DWORD PTR [esp+208] + mov ebp, DWORD PTR [esp+232] + mov edx, DWORD PTR [esp+224] + pxor xmm0, xmm0 + pxor xmm2, xmm2 + cmp edx, 12 + jne L_AES_GCM_decrypt_aesni_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + pinsrd xmm0, DWORD PTR [esi], 0 + pinsrd xmm0, DWORD PTR [esi+4], 1 + pinsrd xmm0, DWORD PTR [esi+8], 2 + pinsrd xmm0, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + movdqa xmm5, xmm0 + movdqa xmm1, OWORD PTR [ebp] + pxor xmm5, xmm1 + movdqa xmm3, OWORD PTR [ebp+16] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+32] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+48] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+64] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+80] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+96] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+112] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+128] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+144] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + cmp DWORD PTR [esp+236], 11 + movdqa xmm3, OWORD PTR [ebp+160] + 
jl L_AES_GCM_decrypt_aesni_calc_iv_12_last + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+176] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + cmp DWORD PTR [esp+236], 13 + movdqa xmm3, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_aesni_calc_iv_12_last + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+208] + aesenc xmm1, xmm3 + aesenc xmm5, xmm3 + movdqa xmm3, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_aesni_calc_iv_12_last: + aesenclast xmm1, xmm3 + aesenclast xmm5, xmm3 + pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask + movdqu OWORD PTR [esp+80], xmm5 + jmp L_AES_GCM_decrypt_aesni_iv_done +L_AES_GCM_decrypt_aesni_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + movdqa xmm1, OWORD PTR [ebp] + aesenc xmm1, [ebp+16] + aesenc xmm1, [ebp+32] + aesenc xmm1, [ebp+48] + aesenc xmm1, [ebp+64] + aesenc xmm1, [ebp+80] + aesenc xmm1, [ebp+96] + aesenc xmm1, [ebp+112] + aesenc xmm1, [ebp+128] + aesenc xmm1, [ebp+144] + cmp DWORD PTR [esp+236], 11 + movdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_aesni_calc_iv_1_aesenc_avx_last + aesenc xmm1, xmm5 + aesenc xmm1, [ebp+176] + cmp DWORD PTR [esp+236], 13 + movdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_aesni_calc_iv_1_aesenc_avx_last + aesenc xmm1, xmm5 + aesenc xmm1, [ebp+208] + movdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_aesni_calc_iv_1_aesenc_avx_last: + aesenclast xmm1, xmm5 + pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov ecx, 0 + je L_AES_GCM_decrypt_aesni_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_decrypt_aesni_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_aesni_calc_iv_16_loop: + movdqu xmm4, OWORD PTR [esi+ecx] + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm4 + pshufd xmm5, xmm0, 78 + pshufd xmm6, xmm1, 78 + movdqa xmm7, xmm1 + movdqa xmm4, xmm1 + pclmulqdq xmm7, xmm0, 17 + pclmulqdq xmm4, xmm0, 0 + pxor xmm5, xmm0 + pxor xmm6, xmm1 + pclmulqdq xmm5, xmm6, 0 
+ pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm4 + movdqa xmm0, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm3, xmm6 + pxor xmm0, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm0 + psrld xmm4, 31 + psrld xmm5, 31 + pslld xmm3, 1 + pslld xmm0, 1 + movdqa xmm6, xmm4 + pslldq xmm4, 4 + psrldq xmm6, 12 + pslldq xmm5, 4 + por xmm0, xmm6 + por xmm3, xmm4 + por xmm0, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm3 + movdqa xmm6, xmm3 + pslld xmm4, 31 + pslld xmm5, 30 + pslld xmm6, 25 + pxor xmm4, xmm5 + pxor xmm4, xmm6 + movdqa xmm5, xmm4 + psrldq xmm5, 4 + pslldq xmm4, 12 + pxor xmm3, xmm4 + movdqa xmm6, xmm3 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + psrld xmm6, 1 + psrld xmm7, 2 + psrld xmm4, 7 + pxor xmm6, xmm7 + pxor xmm6, xmm4 + pxor xmm6, xmm5 + pxor xmm6, xmm3 + pxor xmm0, xmm6 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_aesni_calc_iv_16_loop + mov edx, DWORD PTR [esp+224] + cmp ecx, edx + je L_AES_GCM_decrypt_aesni_calc_iv_done +L_AES_GCM_decrypt_aesni_calc_iv_lt16: + sub esp, 16 + pxor xmm4, xmm4 + xor ebx, ebx + movdqu OWORD PTR [esp], xmm4 +L_AES_GCM_decrypt_aesni_calc_iv_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_aesni_calc_iv_loop + movdqu xmm4, OWORD PTR [esp] + add esp, 16 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm4 + pshufd xmm5, xmm0, 78 + pshufd xmm6, xmm1, 78 + movdqa xmm7, xmm1 + movdqa xmm4, xmm1 + pclmulqdq xmm7, xmm0, 17 + pclmulqdq xmm4, xmm0, 0 + pxor xmm5, xmm0 + pxor xmm6, xmm1 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm4 + movdqa xmm0, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm3, xmm6 + pxor xmm0, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm0 + psrld xmm4, 31 + psrld xmm5, 31 + pslld xmm3, 1 + pslld xmm0, 1 + movdqa xmm6, xmm4 + pslldq xmm4, 4 + psrldq xmm6, 12 + pslldq xmm5, 4 + por xmm0, xmm6 + por xmm3, xmm4 + por xmm0, xmm5 + movdqa xmm4, xmm3 + 
movdqa xmm5, xmm3 + movdqa xmm6, xmm3 + pslld xmm4, 31 + pslld xmm5, 30 + pslld xmm6, 25 + pxor xmm4, xmm5 + pxor xmm4, xmm6 + movdqa xmm5, xmm4 + psrldq xmm5, 4 + pslldq xmm4, 12 + pxor xmm3, xmm4 + movdqa xmm6, xmm3 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + psrld xmm6, 1 + psrld xmm7, 2 + psrld xmm4, 7 + pxor xmm6, xmm7 + pxor xmm6, xmm4 + pxor xmm6, xmm5 + pxor xmm6, xmm3 + pxor xmm0, xmm6 +L_AES_GCM_decrypt_aesni_calc_iv_done: + ; T = Encrypt counter + pxor xmm4, xmm4 + shl edx, 3 + pinsrd xmm4, edx, 0 + pxor xmm0, xmm4 + pshufd xmm5, xmm0, 78 + pshufd xmm6, xmm1, 78 + movdqa xmm7, xmm1 + movdqa xmm4, xmm1 + pclmulqdq xmm7, xmm0, 17 + pclmulqdq xmm4, xmm0, 0 + pxor xmm5, xmm0 + pxor xmm6, xmm1 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm4 + movdqa xmm0, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm3, xmm6 + pxor xmm0, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm0 + psrld xmm4, 31 + psrld xmm5, 31 + pslld xmm3, 1 + pslld xmm0, 1 + movdqa xmm6, xmm4 + pslldq xmm4, 4 + psrldq xmm6, 12 + pslldq xmm5, 4 + por xmm0, xmm6 + por xmm3, xmm4 + por xmm0, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm3 + movdqa xmm6, xmm3 + pslld xmm4, 31 + pslld xmm5, 30 + pslld xmm6, 25 + pxor xmm4, xmm5 + pxor xmm4, xmm6 + movdqa xmm5, xmm4 + psrldq xmm5, 4 + pslldq xmm4, 12 + pxor xmm3, xmm4 + movdqa xmm6, xmm3 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + psrld xmm6, 1 + psrld xmm7, 2 + psrld xmm4, 7 + pxor xmm6, xmm7 + pxor xmm6, xmm4 + pxor xmm6, xmm5 + pxor xmm6, xmm3 + pxor xmm0, xmm6 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + ; Encrypt counter + movdqa xmm4, OWORD PTR [ebp] + pxor xmm4, xmm0 + aesenc xmm4, [ebp+16] + aesenc xmm4, [ebp+32] + aesenc xmm4, [ebp+48] + aesenc xmm4, [ebp+64] + aesenc xmm4, [ebp+80] + aesenc xmm4, [ebp+96] + aesenc xmm4, [ebp+112] + aesenc xmm4, [ebp+128] + aesenc xmm4, [ebp+144] + cmp DWORD PTR [esp+236], 11 + movdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_aesni_calc_iv_2_aesenc_avx_last 
+ aesenc xmm4, xmm5 + aesenc xmm4, [ebp+176] + cmp DWORD PTR [esp+236], 13 + movdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_aesni_calc_iv_2_aesenc_avx_last + aesenc xmm4, xmm5 + aesenc xmm4, [ebp+208] + movdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_aesni_calc_iv_2_aesenc_avx_last: + aesenclast xmm4, xmm5 + movdqu OWORD PTR [esp+80], xmm4 +L_AES_GCM_decrypt_aesni_iv_done: + mov esi, DWORD PTR [esp+204] + ; Additional authentication data + mov edx, DWORD PTR [esp+220] + cmp edx, 0 + je L_AES_GCM_decrypt_aesni_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_decrypt_aesni_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_aesni_calc_aad_16_loop: + movdqu xmm4, OWORD PTR [esi+ecx] + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm2, xmm4 + pshufd xmm5, xmm2, 78 + pshufd xmm6, xmm1, 78 + movdqa xmm7, xmm1 + movdqa xmm4, xmm1 + pclmulqdq xmm7, xmm2, 17 + pclmulqdq xmm4, xmm2, 0 + pxor xmm5, xmm2 + pxor xmm6, xmm1 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm4 + movdqa xmm2, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm3, xmm6 + pxor xmm2, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm2 + psrld xmm4, 31 + psrld xmm5, 31 + pslld xmm3, 1 + pslld xmm2, 1 + movdqa xmm6, xmm4 + pslldq xmm4, 4 + psrldq xmm6, 12 + pslldq xmm5, 4 + por xmm2, xmm6 + por xmm3, xmm4 + por xmm2, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm3 + movdqa xmm6, xmm3 + pslld xmm4, 31 + pslld xmm5, 30 + pslld xmm6, 25 + pxor xmm4, xmm5 + pxor xmm4, xmm6 + movdqa xmm5, xmm4 + psrldq xmm5, 4 + pslldq xmm4, 12 + pxor xmm3, xmm4 + movdqa xmm6, xmm3 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + psrld xmm6, 1 + psrld xmm7, 2 + psrld xmm4, 7 + pxor xmm6, xmm7 + pxor xmm6, xmm4 + pxor xmm6, xmm5 + pxor xmm6, xmm3 + pxor xmm2, xmm6 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_aesni_calc_aad_16_loop + mov edx, DWORD PTR [esp+220] + cmp ecx, edx + je L_AES_GCM_decrypt_aesni_calc_aad_done +L_AES_GCM_decrypt_aesni_calc_aad_lt16: + 
sub esp, 16 + pxor xmm4, xmm4 + xor ebx, ebx + movdqu OWORD PTR [esp], xmm4 +L_AES_GCM_decrypt_aesni_calc_aad_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_aesni_calc_aad_loop + movdqu xmm4, OWORD PTR [esp] + add esp, 16 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm2, xmm4 + pshufd xmm5, xmm2, 78 + pshufd xmm6, xmm1, 78 + movdqa xmm7, xmm1 + movdqa xmm4, xmm1 + pclmulqdq xmm7, xmm2, 17 + pclmulqdq xmm4, xmm2, 0 + pxor xmm5, xmm2 + pxor xmm6, xmm1 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm4 + movdqa xmm2, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm3, xmm6 + pxor xmm2, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm2 + psrld xmm4, 31 + psrld xmm5, 31 + pslld xmm3, 1 + pslld xmm2, 1 + movdqa xmm6, xmm4 + pslldq xmm4, 4 + psrldq xmm6, 12 + pslldq xmm5, 4 + por xmm2, xmm6 + por xmm3, xmm4 + por xmm2, xmm5 + movdqa xmm4, xmm3 + movdqa xmm5, xmm3 + movdqa xmm6, xmm3 + pslld xmm4, 31 + pslld xmm5, 30 + pslld xmm6, 25 + pxor xmm4, xmm5 + pxor xmm4, xmm6 + movdqa xmm5, xmm4 + psrldq xmm5, 4 + pslldq xmm4, 12 + pxor xmm3, xmm4 + movdqa xmm6, xmm3 + movdqa xmm7, xmm3 + movdqa xmm4, xmm3 + psrld xmm6, 1 + psrld xmm7, 2 + psrld xmm4, 7 + pxor xmm6, xmm7 + pxor xmm6, xmm4 + pxor xmm6, xmm5 + pxor xmm6, xmm3 + pxor xmm2, xmm6 +L_AES_GCM_decrypt_aesni_calc_aad_done: + movdqu OWORD PTR [esp+96], xmm2 + mov esi, DWORD PTR [esp+196] + mov edi, DWORD PTR [esp+200] + ; Calculate counter and H + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm5, xmm1 + paddd xmm0, OWORD PTR L_aes_gcm_one + movdqa xmm4, xmm1 + movdqu OWORD PTR [esp+64], xmm0 + psrlq xmm5, 63 + psllq xmm4, 1 + pslldq xmm5, 8 + por xmm4, xmm5 + pshufd xmm1, xmm1, 255 + psrad xmm1, 31 + pand xmm1, OWORD PTR L_aes_gcm_mod2_128 + pxor xmm1, xmm4 + xor ebx, ebx + cmp DWORD PTR [esp+216], 64 + mov eax, DWORD PTR [esp+216] + jl L_AES_GCM_decrypt_aesni_done_64 + and eax, 
4294967232 + movdqa xmm6, xmm2 + ; H ^ 1 + movdqu OWORD PTR [esp], xmm1 + ; H ^ 2 + pshufd xmm5, xmm1, 78 + pshufd xmm6, xmm1, 78 + movdqa xmm7, xmm1 + movdqa xmm4, xmm1 + pclmulqdq xmm7, xmm1, 17 + pclmulqdq xmm4, xmm1, 0 + pxor xmm5, xmm1 + pxor xmm6, xmm1 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm0, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm4, xmm6 + pxor xmm0, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pslld xmm5, 31 + pslld xmm6, 30 + pslld xmm7, 25 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqa xmm7, xmm5 + psrldq xmm7, 4 + pslldq xmm5, 12 + pxor xmm4, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + psrld xmm5, 1 + psrld xmm6, 2 + pxor xmm5, xmm6 + pxor xmm5, xmm4 + psrld xmm4, 7 + pxor xmm5, xmm7 + pxor xmm5, xmm4 + pxor xmm0, xmm5 + movdqu OWORD PTR [esp+16], xmm0 + ; H ^ 3 + pshufd xmm5, xmm1, 78 + pshufd xmm6, xmm0, 78 + movdqa xmm7, xmm0 + movdqa xmm4, xmm0 + pclmulqdq xmm7, xmm1, 17 + pclmulqdq xmm4, xmm1, 0 + pxor xmm5, xmm1 + pxor xmm6, xmm0 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm4, xmm6 + pxor xmm3, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pslld xmm5, 31 + pslld xmm6, 30 + pslld xmm7, 25 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqa xmm7, xmm5 + psrldq xmm7, 4 + pslldq xmm5, 12 + pxor xmm4, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + psrld xmm5, 1 + psrld xmm6, 2 + pxor xmm5, xmm6 + pxor xmm5, xmm4 + psrld xmm4, 7 + pxor xmm5, xmm7 + pxor xmm5, xmm4 + pxor xmm3, xmm5 + movdqu OWORD PTR [esp+32], xmm3 + ; H ^ 4 + pshufd xmm5, xmm0, 78 + pshufd xmm6, xmm0, 78 + movdqa xmm7, xmm0 + movdqa xmm4, xmm0 + pclmulqdq xmm7, xmm0, 17 + pclmulqdq xmm4, xmm0, 0 + pxor xmm5, xmm0 + pxor xmm6, xmm0 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm3, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm4, xmm6 + 
pxor xmm3, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pslld xmm5, 31 + pslld xmm6, 30 + pslld xmm7, 25 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqa xmm7, xmm5 + psrldq xmm7, 4 + pslldq xmm5, 12 + pxor xmm4, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + psrld xmm5, 1 + psrld xmm6, 2 + pxor xmm5, xmm6 + pxor xmm5, xmm4 + psrld xmm4, 7 + pxor xmm5, xmm7 + pxor xmm5, xmm4 + pxor xmm3, xmm5 + movdqu OWORD PTR [esp+48], xmm3 + cmp edi, esi + jne L_AES_GCM_decrypt_aesni_ghash_64 +L_AES_GCM_decrypt_aesni_ghash_64_inplace: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + ; Encrypt 64 bytes of counter + movdqu xmm4, OWORD PTR [esp+64] + movdqu xmm3, xmm4 + paddd xmm3, OWORD PTR L_aes_gcm_four + movdqu OWORD PTR [esp+64], xmm3 + movdqa xmm3, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pshufb xmm4, xmm3 + paddd xmm5, OWORD PTR L_aes_gcm_one + pshufb xmm5, xmm3 + paddd xmm6, OWORD PTR L_aes_gcm_two + pshufb xmm6, xmm3 + paddd xmm7, OWORD PTR L_aes_gcm_three + pshufb xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp] + pxor xmm4, xmm3 + pxor xmm5, xmm3 + pxor xmm6, xmm3 + pxor xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+16] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+32] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+48] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+64] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+80] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+96] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+112] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR 
[ebp+128] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+144] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + cmp DWORD PTR [esp+236], 11 + movdqa xmm3, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_aesniinplace_aesenc_64_ghash_avx_done + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+176] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + cmp DWORD PTR [esp+236], 13 + movdqa xmm3, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_aesniinplace_aesenc_64_ghash_avx_done + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+208] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_aesniinplace_aesenc_64_ghash_avx_done: + aesenclast xmm4, xmm3 + aesenclast xmm5, xmm3 + movdqu xmm0, OWORD PTR [ecx] + movdqu xmm1, OWORD PTR [ecx+16] + pxor xmm4, xmm0 + pxor xmm5, xmm1 + movdqu OWORD PTR [esp+112], xmm0 + movdqu OWORD PTR [esp+128], xmm1 + movdqu OWORD PTR [edx], xmm4 + movdqu OWORD PTR [edx+16], xmm5 + aesenclast xmm6, xmm3 + aesenclast xmm7, xmm3 + movdqu xmm0, OWORD PTR [ecx+32] + movdqu xmm1, OWORD PTR [ecx+48] + pxor xmm6, xmm0 + pxor xmm7, xmm1 + movdqu OWORD PTR [esp+144], xmm0 + movdqu OWORD PTR [esp+160], xmm1 + movdqu OWORD PTR [edx+32], xmm6 + movdqu OWORD PTR [edx+48], xmm7 + ; ghash encrypted counter + movdqu xmm6, OWORD PTR [esp+96] + movdqu xmm3, OWORD PTR [esp+48] + movdqu xmm4, OWORD PTR [esp+112] + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm6 + pshufd xmm5, xmm3, 78 + pshufd xmm1, xmm4, 78 + pxor xmm5, xmm3 + pxor xmm1, xmm4 + movdqa xmm7, xmm4 + pclmulqdq xmm7, xmm3, 17 + movdqa xmm6, xmm4 + pclmulqdq xmm6, xmm3, 0 + pclmulqdq xmm5, xmm1, 0 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqu xmm3, OWORD PTR [esp+32] + movdqu xmm4, 
OWORD PTR [esp+128] + pshufd xmm0, xmm3, 78 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm3 + pshufd xmm1, xmm4, 78 + pxor xmm1, xmm4 + movdqa xmm2, xmm4 + pclmulqdq xmm2, xmm3, 17 + pclmulqdq xmm3, xmm4, 0 + pclmulqdq xmm0, xmm1, 0 + pxor xmm5, xmm3 + pxor xmm6, xmm3 + pxor xmm5, xmm2 + pxor xmm7, xmm2 + pxor xmm5, xmm0 + movdqu xmm3, OWORD PTR [esp+16] + movdqu xmm4, OWORD PTR [esp+144] + pshufd xmm0, xmm3, 78 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm3 + pshufd xmm1, xmm4, 78 + pxor xmm1, xmm4 + movdqa xmm2, xmm4 + pclmulqdq xmm2, xmm3, 17 + pclmulqdq xmm3, xmm4, 0 + pclmulqdq xmm0, xmm1, 0 + pxor xmm5, xmm3 + pxor xmm6, xmm3 + pxor xmm5, xmm2 + pxor xmm7, xmm2 + pxor xmm5, xmm0 + movdqu xmm3, OWORD PTR [esp] + movdqu xmm4, OWORD PTR [esp+160] + pshufd xmm0, xmm3, 78 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm3 + pshufd xmm1, xmm4, 78 + pxor xmm1, xmm4 + movdqa xmm2, xmm4 + pclmulqdq xmm2, xmm3, 17 + pclmulqdq xmm3, xmm4, 0 + pclmulqdq xmm0, xmm1, 0 + pxor xmm5, xmm3 + pxor xmm6, xmm3 + pxor xmm5, xmm2 + pxor xmm7, xmm2 + pxor xmm5, xmm0 + movdqa xmm1, xmm5 + psrldq xmm5, 8 + pslldq xmm1, 8 + pxor xmm6, xmm1 + pxor xmm7, xmm5 + movdqa xmm3, xmm6 + movdqa xmm0, xmm6 + movdqa xmm1, xmm6 + pslld xmm3, 31 + pslld xmm0, 30 + pslld xmm1, 25 + pxor xmm3, xmm0 + pxor xmm3, xmm1 + movdqa xmm0, xmm3 + pslldq xmm3, 12 + psrldq xmm0, 4 + pxor xmm6, xmm3 + movdqa xmm1, xmm6 + movdqa xmm5, xmm6 + movdqa xmm4, xmm6 + psrld xmm1, 1 + psrld xmm5, 2 + psrld xmm4, 7 + pxor xmm1, xmm5 + pxor xmm1, xmm4 + pxor xmm1, xmm0 + pxor xmm6, xmm1 + pxor xmm6, xmm7 + movdqu OWORD PTR [esp+96], xmm6 + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_decrypt_aesni_ghash_64_inplace + jmp L_AES_GCM_decrypt_aesni_ghash_64_done +L_AES_GCM_decrypt_aesni_ghash_64: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + ; Encrypt 64 bytes of counter + movdqu xmm4, OWORD PTR [esp+64] + movdqu xmm3, xmm4 + paddd xmm3, OWORD PTR L_aes_gcm_four + 
movdqu OWORD PTR [esp+64], xmm3 + movdqa xmm3, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pshufb xmm4, xmm3 + paddd xmm5, OWORD PTR L_aes_gcm_one + pshufb xmm5, xmm3 + paddd xmm6, OWORD PTR L_aes_gcm_two + pshufb xmm6, xmm3 + paddd xmm7, OWORD PTR L_aes_gcm_three + pshufb xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp] + pxor xmm4, xmm3 + pxor xmm5, xmm3 + pxor xmm6, xmm3 + pxor xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+16] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+32] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+48] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+64] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+80] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+96] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+112] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+128] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+144] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + cmp DWORD PTR [esp+236], 11 + movdqa xmm3, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+176] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + cmp DWORD PTR [esp+236], 13 + movdqa xmm3, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, 
OWORD PTR [ebp+208] + aesenc xmm4, xmm3 + aesenc xmm5, xmm3 + aesenc xmm6, xmm3 + aesenc xmm7, xmm3 + movdqa xmm3, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_aesni_aesenc_64_ghash_avx_done: + aesenclast xmm4, xmm3 + aesenclast xmm5, xmm3 + movdqu xmm0, OWORD PTR [ecx] + movdqu xmm1, OWORD PTR [ecx+16] + pxor xmm4, xmm0 + pxor xmm5, xmm1 + movdqu OWORD PTR [edx], xmm4 + movdqu OWORD PTR [edx+16], xmm5 + aesenclast xmm6, xmm3 + aesenclast xmm7, xmm3 + movdqu xmm0, OWORD PTR [ecx+32] + movdqu xmm1, OWORD PTR [ecx+48] + pxor xmm6, xmm0 + pxor xmm7, xmm1 + movdqu OWORD PTR [edx+32], xmm6 + movdqu OWORD PTR [edx+48], xmm7 + ; ghash encrypted counter + movdqu xmm6, OWORD PTR [esp+96] + movdqu xmm3, OWORD PTR [esp+48] + movdqu xmm4, OWORD PTR [ecx] + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm6 + pshufd xmm5, xmm3, 78 + pshufd xmm1, xmm4, 78 + pxor xmm5, xmm3 + pxor xmm1, xmm4 + movdqa xmm7, xmm4 + pclmulqdq xmm7, xmm3, 17 + movdqa xmm6, xmm4 + pclmulqdq xmm6, xmm3, 0 + pclmulqdq xmm5, xmm1, 0 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqu xmm3, OWORD PTR [esp+32] + movdqu xmm4, OWORD PTR [ecx+16] + pshufd xmm0, xmm3, 78 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm3 + pshufd xmm1, xmm4, 78 + pxor xmm1, xmm4 + movdqa xmm2, xmm4 + pclmulqdq xmm2, xmm3, 17 + pclmulqdq xmm3, xmm4, 0 + pclmulqdq xmm0, xmm1, 0 + pxor xmm5, xmm3 + pxor xmm6, xmm3 + pxor xmm5, xmm2 + pxor xmm7, xmm2 + pxor xmm5, xmm0 + movdqu xmm3, OWORD PTR [esp+16] + movdqu xmm4, OWORD PTR [ecx+32] + pshufd xmm0, xmm3, 78 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm3 + pshufd xmm1, xmm4, 78 + pxor xmm1, xmm4 + movdqa xmm2, xmm4 + pclmulqdq xmm2, xmm3, 17 + pclmulqdq xmm3, xmm4, 0 + pclmulqdq xmm0, xmm1, 0 + pxor xmm5, xmm3 + pxor xmm6, xmm3 + pxor xmm5, xmm2 + pxor xmm7, xmm2 + pxor xmm5, xmm0 + movdqu xmm3, OWORD PTR [esp] + movdqu xmm4, OWORD PTR [ecx+48] + pshufd xmm0, xmm3, 78 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm3 + pshufd xmm1, xmm4, 
78 + pxor xmm1, xmm4 + movdqa xmm2, xmm4 + pclmulqdq xmm2, xmm3, 17 + pclmulqdq xmm3, xmm4, 0 + pclmulqdq xmm0, xmm1, 0 + pxor xmm5, xmm3 + pxor xmm6, xmm3 + pxor xmm5, xmm2 + pxor xmm7, xmm2 + pxor xmm5, xmm0 + movdqa xmm1, xmm5 + psrldq xmm5, 8 + pslldq xmm1, 8 + pxor xmm6, xmm1 + pxor xmm7, xmm5 + movdqa xmm3, xmm6 + movdqa xmm0, xmm6 + movdqa xmm1, xmm6 + pslld xmm3, 31 + pslld xmm0, 30 + pslld xmm1, 25 + pxor xmm3, xmm0 + pxor xmm3, xmm1 + movdqa xmm0, xmm3 + pslldq xmm3, 12 + psrldq xmm0, 4 + pxor xmm6, xmm3 + movdqa xmm1, xmm6 + movdqa xmm5, xmm6 + movdqa xmm4, xmm6 + psrld xmm1, 1 + psrld xmm5, 2 + psrld xmm4, 7 + pxor xmm1, xmm5 + pxor xmm1, xmm4 + pxor xmm1, xmm0 + pxor xmm6, xmm1 + pxor xmm6, xmm7 + movdqu OWORD PTR [esp+96], xmm6 + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_decrypt_aesni_ghash_64 +L_AES_GCM_decrypt_aesni_ghash_64_done: + movdqa xmm2, xmm6 + movdqu xmm1, OWORD PTR [esp] +L_AES_GCM_decrypt_aesni_done_64: + mov edx, DWORD PTR [esp+216] + cmp ebx, edx + jge L_AES_GCM_decrypt_aesni_done_dec + mov eax, DWORD PTR [esp+216] + and eax, 4294967280 + cmp ebx, eax + jge L_AES_GCM_decrypt_aesni_last_block_done +L_AES_GCM_decrypt_aesni_last_block_start: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + movdqu xmm5, OWORD PTR [ecx] + pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm5, xmm2 + movdqu OWORD PTR [esp], xmm5 + movdqu xmm4, OWORD PTR [esp+64] + movdqa xmm5, xmm4 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm5, OWORD PTR L_aes_gcm_one + pxor xmm4, [ebp] + movdqu OWORD PTR [esp+64], xmm5 + movdqu xmm0, OWORD PTR [esp] + pclmulqdq xmm0, xmm1, 16 + aesenc xmm4, [ebp+16] + aesenc xmm4, [ebp+32] + movdqu xmm3, OWORD PTR [esp] + pclmulqdq xmm3, xmm1, 1 + aesenc xmm4, [ebp+48] + aesenc xmm4, [ebp+64] + aesenc xmm4, [ebp+80] + movdqu xmm5, OWORD PTR [esp] + pclmulqdq xmm5, xmm1, 17 + aesenc xmm4, [ebp+96] + pxor xmm0, xmm3 + movdqa xmm6, xmm0 + psrldq xmm0, 8 + pslldq xmm6, 8 + aesenc xmm4, [ebp+112] + movdqu xmm3, 
OWORD PTR [esp] + pclmulqdq xmm3, xmm1, 0 + pxor xmm6, xmm3 + pxor xmm5, xmm0 + movdqa xmm7, OWORD PTR L_aes_gcm_mod2_128 + movdqa xmm3, xmm6 + pclmulqdq xmm3, xmm7, 16 + aesenc xmm4, [ebp+128] + pshufd xmm0, xmm6, 78 + pxor xmm0, xmm3 + movdqa xmm3, xmm0 + pclmulqdq xmm3, xmm7, 16 + aesenc xmm4, [ebp+144] + pshufd xmm2, xmm0, 78 + pxor xmm2, xmm3 + pxor xmm2, xmm5 + cmp DWORD PTR [esp+236], 11 + movdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_aesni_aesenc_gfmul_last + aesenc xmm4, xmm5 + aesenc xmm4, [ebp+176] + cmp DWORD PTR [esp+236], 13 + movdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_aesni_aesenc_gfmul_last + aesenc xmm4, xmm5 + aesenc xmm4, [ebp+208] + movdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_aesni_aesenc_gfmul_last: + aesenclast xmm4, xmm5 + movdqu xmm5, OWORD PTR [ecx] + pxor xmm4, xmm5 + movdqu OWORD PTR [edx], xmm4 + add ebx, 16 + cmp ebx, eax + jl L_AES_GCM_decrypt_aesni_last_block_start +L_AES_GCM_decrypt_aesni_last_block_done: + mov ecx, DWORD PTR [esp+216] + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_done + movdqu xmm0, OWORD PTR [esp+64] + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64 + pxor xmm0, [ebp] + aesenc xmm0, [ebp+16] + aesenc xmm0, [ebp+32] + aesenc xmm0, [ebp+48] + aesenc xmm0, [ebp+64] + aesenc xmm0, [ebp+80] + aesenc xmm0, [ebp+96] + aesenc xmm0, [ebp+112] + aesenc xmm0, [ebp+128] + aesenc xmm0, [ebp+144] + cmp DWORD PTR [esp+236], 11 + movdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_aesenc_avx_last + aesenc xmm0, xmm5 + aesenc xmm0, [ebp+176] + cmp DWORD PTR [esp+236], 13 + movdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_aesenc_avx_last + aesenc xmm0, xmm5 + aesenc xmm0, [ebp+208] + movdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_aesenc_avx_last: + aesenclast xmm0, xmm5 + sub esp, 32 + xor ecx, ecx + movdqu OWORD PTR [esp], xmm0 + pxor xmm4, xmm4 + movdqu OWORD PTR 
[esp+16], xmm4 +L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_loop: + movzx eax, BYTE PTR [esi+ebx] + mov BYTE PTR [esp+ecx+16], al + xor al, BYTE PTR [esp+ecx] + mov BYTE PTR [edi+ebx], al + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_loop + movdqu xmm0, OWORD PTR [esp+16] + add esp, 32 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm2, xmm0 + pshufd xmm5, xmm1, 78 + pshufd xmm6, xmm2, 78 + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + pclmulqdq xmm7, xmm1, 17 + pclmulqdq xmm4, xmm1, 0 + pxor xmm5, xmm1 + pxor xmm6, xmm2 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm2, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm4, xmm6 + pxor xmm2, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pslld xmm5, 31 + pslld xmm6, 30 + pslld xmm7, 25 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqa xmm7, xmm5 + psrldq xmm7, 4 + pslldq xmm5, 12 + pxor xmm4, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + psrld xmm5, 1 + psrld xmm6, 2 + pxor xmm5, xmm6 + pxor xmm5, xmm4 + psrld xmm4, 7 + pxor xmm5, xmm7 + pxor xmm5, xmm4 + pxor xmm2, xmm5 +L_AES_GCM_decrypt_aesni_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_aesni_done_dec: + mov esi, DWORD PTR [esp+212] + mov ebp, DWORD PTR [esp+228] + mov edx, DWORD PTR [esp+216] + mov ecx, DWORD PTR [esp+220] + shl edx, 3 + shl ecx, 3 + pinsrd xmm4, edx, 0 + pinsrd xmm4, ecx, 2 + mov edx, DWORD PTR [esp+216] + mov ecx, DWORD PTR [esp+220] + shr edx, 29 + shr ecx, 29 + pinsrd xmm4, edx, 1 + pinsrd xmm4, ecx, 3 + pxor xmm2, xmm4 + pshufd xmm5, xmm1, 78 + pshufd xmm6, xmm2, 78 + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + pclmulqdq xmm7, xmm1, 17 + pclmulqdq xmm4, xmm1, 0 + pxor xmm5, xmm1 + pxor xmm6, xmm2 + pclmulqdq xmm5, xmm6, 0 + pxor xmm5, xmm4 + pxor xmm5, xmm7 + movdqa xmm6, xmm5 + movdqa xmm2, xmm7 + pslldq xmm6, 8 + psrldq xmm5, 8 + pxor xmm4, xmm6 + pxor xmm2, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + pslld xmm5, 31 
+ pslld xmm6, 30 + pslld xmm7, 25 + pxor xmm5, xmm6 + pxor xmm5, xmm7 + movdqa xmm7, xmm5 + psrldq xmm7, 4 + pslldq xmm5, 12 + pxor xmm4, xmm5 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + psrld xmm5, 1 + psrld xmm6, 2 + pxor xmm5, xmm6 + pxor xmm5, xmm4 + psrld xmm4, 7 + pxor xmm5, xmm7 + pxor xmm5, xmm4 + pxor xmm2, xmm5 + pshufb xmm2, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm4, OWORD PTR [esp+80] + pxor xmm4, xmm2 + mov edi, DWORD PTR [esp+240] + cmp ebp, 16 + je L_AES_GCM_decrypt_aesni_cmp_tag_16 + sub esp, 16 + xor ecx, ecx + xor ebx, ebx + movdqu OWORD PTR [esp], xmm4 +L_AES_GCM_decrypt_aesni_cmp_tag_loop: + movzx eax, BYTE PTR [esp+ecx] + xor al, BYTE PTR [esi+ecx] + or bl, al + inc ecx + cmp ecx, ebp + jne L_AES_GCM_decrypt_aesni_cmp_tag_loop + cmp bl, 0 + sete bl + add esp, 16 + xor ecx, ecx + jmp L_AES_GCM_decrypt_aesni_cmp_tag_done +L_AES_GCM_decrypt_aesni_cmp_tag_16: + movdqu xmm5, OWORD PTR [esi] + pcmpeqb xmm4, xmm5 + pmovmskb edx, xmm4 + ; %%edx == 0xFFFF then return 1 else => return 0 + xor ebx, ebx + cmp edx, 65535 + sete bl +L_AES_GCM_decrypt_aesni_cmp_tag_done: + mov DWORD PTR [edi], ebx + add esp, 176 + pop ebp + pop edi + pop esi + pop ebx + ret +AES_GCM_decrypt_aesni ENDP +_TEXT ENDS +IFDEF WOLFSSL_AESGCM_STREAM +_TEXT SEGMENT READONLY PARA +AES_GCM_init_aesni PROC + ; Computes H (the encryption of an all-zero block) and the encrypted initial counter block from the round keys and the IV. + ; Stack arguments are read at these offsets once the four pushes and sub esp, 16 are done: + ; [esp+36] round-key table; it is loaded with movdqa, so it must be 16-byte aligned + ; [esp+40] value compared against 11 and 13 to select how many aesenc rounds run + ; [esp+44] IV pointer and [esp+48] IV length in bytes + ; [esp+52] receives xmm5 (H) via movdqa + ; [esp+56] receives xmm4 after the L_aes_gcm_bswap_epi64 shuffle and the L_aes_gcm_one add, via movdqa + ; [esp+60] receives the encrypted initial counter block via movdqu + push ebx + push esi + push edi + push ebp + sub esp, 16 + mov ebp, DWORD PTR [esp+36] + mov esi, DWORD PTR [esp+44] + mov edi, DWORD PTR [esp+60] + pxor xmm4, xmm4 + mov edx, DWORD PTR [esp+48] + cmp edx, 12 + jne L_AES_GCM_init_aesni_iv_not_12 + ; Calculate values when IV is 12 bytes + ; Set counter based on IV + ; 16777216 is 0x01000000: inserted as dword 3 it places the bytes 00 00 00 01 after the 12 IV bytes + mov ecx, 16777216 + pinsrd xmm4, DWORD PTR [esi], 0 + pinsrd xmm4, DWORD PTR [esi+4], 1 + pinsrd xmm4, DWORD PTR [esi+8], 2 + pinsrd xmm4, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + ; X is all zero, so its first xor with round key 0 is just round key 0; xmm5 carries H and xmm1 carries T, one round key load serving both + movdqa xmm1, xmm4 + movdqa xmm5, OWORD PTR [ebp] + pxor xmm1, xmm5 + movdqa xmm7, OWORD PTR [ebp+16] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [ebp+32] + 
aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [ebp+48] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [ebp+64] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [ebp+80] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [ebp+96] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [ebp+112] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [ebp+128] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [ebp+144] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + ; movdqa does not modify EFLAGS, so each jl below still tests the cmp placed just before the load; when the branch is taken xmm7 already holds the final round key + cmp DWORD PTR [esp+40], 11 + movdqa xmm7, OWORD PTR [ebp+160] + jl L_AES_GCM_init_aesni_calc_iv_12_last + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [ebp+176] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + cmp DWORD PTR [esp+40], 13 + movdqa xmm7, OWORD PTR [ebp+192] + jl L_AES_GCM_init_aesni_calc_iv_12_last + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [ebp+208] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [ebp+224] +L_AES_GCM_init_aesni_calc_iv_12_last: + aesenclast xmm5, xmm7 + aesenclast xmm1, xmm7 + ; L_aes_gcm_bswap_mask is defined outside this view; presumably a byte-reversal shuffle that puts H in the order the multiply code expects + pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask + movdqu OWORD PTR [edi], xmm1 + jmp L_AES_GCM_init_aesni_iv_done +L_AES_GCM_init_aesni_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + movdqa xmm5, OWORD PTR [ebp] + aesenc xmm5, [ebp+16] + aesenc xmm5, [ebp+32] + aesenc xmm5, [ebp+48] + aesenc xmm5, [ebp+64] + aesenc xmm5, [ebp+80] + aesenc xmm5, [ebp+96] + aesenc xmm5, [ebp+112] + aesenc xmm5, [ebp+128] + aesenc xmm5, [ebp+144] + ; same round-count selection as the 12-byte path, here with xmm1 holding the pending round key + cmp DWORD PTR [esp+40], 11 + movdqa xmm1, OWORD PTR [ebp+160] + jl L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last + aesenc xmm5, xmm1 + aesenc xmm5, [ebp+176] + cmp DWORD PTR [esp+40], 13 + movdqa xmm1, OWORD PTR [ebp+192] + jl L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last + aesenc xmm5, xmm1 + aesenc xmm5, [ebp+208] + movdqa xmm1, OWORD PTR [ebp+224] +L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last: + 
aesenclast xmm5, xmm1 + pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + ; xmm4 is still zero here and accumulates the hash of the IV; ecx is the byte offset into the IV + ; mov is used instead of xor to clear ecx so the flags from cmp survive until je + cmp edx, 0 + mov ecx, 0 + je L_AES_GCM_init_aesni_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_init_aesni_calc_iv_lt16 + ; 4294967280 is 0xFFFFFFF0: round the length down to whole 16-byte blocks for the loop bound + and edx, 4294967280 +L_AES_GCM_init_aesni_calc_iv_16_loop: + movdqu xmm0, OWORD PTR [esi+ecx] + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm0 + ; multiply xmm4 by xmm5: pshufd 78 swaps the 64-bit halves, then three pclmulqdq form the product Karatsuba-style (high*high, low*low, xored halves) + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm4, xmm1 + ; shift the 256-bit product (xmm4 high, xmm7 low) left by one bit, carrying bit 31 of each dword upward + movdqa xmm0, xmm7 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por xmm7, xmm0 + por xmm4, xmm1 + ; reduction: fold the low half xmm7 into xmm4 using shifts by 31, 30, 25 and then 1, 2, 7 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm4, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_init_aesni_calc_iv_16_loop + ; reload the unrounded IV length to see whether a partial block remains + mov edx, DWORD PTR [esp+48] + cmp ecx, edx + je L_AES_GCM_init_aesni_calc_iv_done +L_AES_GCM_init_aesni_calc_iv_lt16: + ; copy the remaining IV bytes into a zeroed 16-byte stack buffer so the tail is hashed as a zero-padded block + sub esp, 16 + pxor xmm0, xmm0 + xor ebx, ebx + movdqu OWORD PTR [esp], xmm0 +L_AES_GCM_init_aesni_calc_iv_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_init_aesni_calc_iv_loop + movdqu xmm0, OWORD PTR [esp] + add esp, 16 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm0 + ; multiply xmm4 by xmm5 again, same sequence as in the 16-byte loop + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 
78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por xmm7, xmm0 + por xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm4, xmm2 +L_AES_GCM_init_aesni_calc_iv_done: + ; T = Encrypt counter + ; first mix in the length block: edx holds the IV byte length, shl by 3 turns it into a bit count (only the low 32 bits are kept) placed in dword 0 + pxor xmm0, xmm0 + shl edx, 3 + pinsrd xmm0, edx, 0 + pxor xmm4, xmm0 + ; final multiply of xmm4 by xmm5, same sequence as above + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por xmm7, xmm0 + por xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + 
psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm4, xmm2 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + ; Encrypt counter + movdqa xmm0, OWORD PTR [ebp] + pxor xmm0, xmm4 + aesenc xmm0, [ebp+16] + aesenc xmm0, [ebp+32] + aesenc xmm0, [ebp+48] + aesenc xmm0, [ebp+64] + aesenc xmm0, [ebp+80] + aesenc xmm0, [ebp+96] + aesenc xmm0, [ebp+112] + aesenc xmm0, [ebp+128] + aesenc xmm0, [ebp+144] + ; movdqa does not modify EFLAGS, so each jl below still tests the cmp placed just before the load + cmp DWORD PTR [esp+40], 11 + movdqa xmm1, OWORD PTR [ebp+160] + jl L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last + aesenc xmm0, xmm1 + aesenc xmm0, [ebp+176] + cmp DWORD PTR [esp+40], 13 + movdqa xmm1, OWORD PTR [ebp+192] + jl L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last + aesenc xmm0, xmm1 + aesenc xmm0, [ebp+208] + movdqa xmm1, OWORD PTR [ebp+224] +L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last: + aesenclast xmm0, xmm1 + movdqu OWORD PTR [edi], xmm0 +L_AES_GCM_init_aesni_iv_done: + ; common exit for both IV paths: store xmm5 to the buffer at [esp+52] and the shuffled, incremented xmm4 to the buffer at [esp+56]; movdqa requires both to be 16-byte aligned + mov ebp, DWORD PTR [esp+52] + mov edi, DWORD PTR [esp+56] + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm4, OWORD PTR L_aes_gcm_one + movdqa OWORD PTR [ebp], xmm5 + movdqa OWORD PTR [edi], xmm4 + add esp, 16 + pop ebp + pop edi + pop esi + pop ebx + ret +AES_GCM_init_aesni ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_aad_update_aesni PROC + ; Folds input data into a 128-bit hash state, 16 bytes per loop iteration. + ; Stack arguments after the two pushes: [esp+12] data pointer, [esp+16] byte count, [esp+20] state (read and written with movdqa, so 16-byte aligned), [esp+24] multiplier H (read with movdqa). + ; The loop tests its bound after the body, so one block is always consumed; presumably callers pass a non-zero multiple of 16 - verify at call sites. + push esi + push edi + mov esi, DWORD PTR [esp+12] + mov edx, DWORD PTR [esp+16] + mov edi, DWORD PTR [esp+20] + mov eax, DWORD PTR [esp+24] + movdqa xmm5, OWORD PTR [edi] + movdqa xmm6, OWORD PTR [eax] + xor ecx, ecx +L_AES_GCM_aad_update_aesni_16_loop: + movdqu xmm0, OWORD PTR [esi+ecx] + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm5, xmm0 + ; multiply xmm5 by xmm6: pshufd 78 swaps the 64-bit halves, then three pclmulqdq form the product Karatsuba-style (high*high, low*low, xored halves) + pshufd xmm1, xmm5, 78 + pshufd xmm2, xmm6, 78 + movdqa xmm3, xmm6 + movdqa xmm0, xmm6 + pclmulqdq xmm3, xmm5, 17 + pclmulqdq xmm0, xmm5, 0 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm4, xmm0 + movdqa xmm5, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + 
pxor xmm4, xmm2 + pxor xmm5, xmm1 + ; shift the 256-bit product (xmm5 high, xmm4 low) left by one bit, carrying bit 31 of each dword upward + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm4, 1 + pslld xmm5, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm5, xmm2 + por xmm4, xmm0 + por xmm5, xmm1 + ; reduction: fold the low half xmm4 into xmm5 using shifts by 31, 30, 25 and then 1, 2, 7 + movdqa xmm0, xmm4 + movdqa xmm1, xmm4 + movdqa xmm2, xmm4 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm4, xmm0 + movdqa xmm2, xmm4 + movdqa xmm3, xmm4 + movdqa xmm0, xmm4 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm4 + pxor xmm5, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_aad_update_aesni_16_loop + ; write the updated state in xmm5 back through edi (movdqa, so the buffer must be 16-byte aligned) + movdqa OWORD PTR [edi], xmm5 + pop edi + pop esi + ret +AES_GCM_aad_update_aesni ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_encrypt_block_aesni PROC + ; Encrypts one 16-byte block: the counter block is run through the AES rounds and xored with the input block. + ; Stack arguments after the two pushes: [esp+12] round-key table, [esp+16] value compared against 11 and 13 to select the round count, [esp+20] output pointer, [esp+24] input pointer, [esp+28] counter pointer. + ; The counter is read with movdqu, L_aes_gcm_one is added with paddd, and the result is stored back to the same address. + push esi + push edi + mov ecx, DWORD PTR [esp+12] + mov eax, DWORD PTR [esp+16] + mov edi, DWORD PTR [esp+20] + mov esi, DWORD PTR [esp+24] + mov edx, DWORD PTR [esp+28] + movdqu xmm0, OWORD PTR [edx] + movdqa xmm1, xmm0 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm1, OWORD PTR L_aes_gcm_one + pxor xmm0, [ecx] + movdqu OWORD PTR [edx], xmm1 + aesenc xmm0, [ecx+16] + aesenc xmm0, [ecx+32] + aesenc xmm0, [ecx+48] + aesenc xmm0, [ecx+64] + aesenc xmm0, [ecx+80] + aesenc xmm0, [ecx+96] + aesenc xmm0, [ecx+112] + aesenc xmm0, [ecx+128] + aesenc xmm0, [ecx+144] + ; movdqa does not modify EFLAGS, so each jl below still tests the cmp placed just before the load; when the branch is taken xmm1 already holds the final round key + cmp eax, 11 + movdqa xmm1, OWORD PTR [ecx+160] + jl L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last + aesenc xmm0, xmm1 + aesenc xmm0, [ecx+176] + cmp eax, 13 + movdqa xmm1, OWORD PTR [ecx+192] + jl L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last + aesenc xmm0, xmm1 + aesenc xmm0, [ecx+208] + movdqa xmm1, OWORD PTR [ecx+224] +L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last: + aesenclast xmm0, xmm1 + ; xor the encrypted counter with the input block, then store the result through edi + movdqu xmm1, OWORD PTR [esi] + pxor xmm0, xmm1 + movdqu OWORD PTR 
[edi], xmm0 + ; xmm0 is left holding the shuffled output block when the routine returns; NOTE(review): confirm whether callers rely on that + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pop edi + pop esi + ret +AES_GCM_encrypt_block_aesni ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_ghash_block_aesni PROC + ; Applies one hash step to a single 16-byte block: state = (state xor shuffled block) multiplied by H and reduced. + ; Nothing is pushed, so the arguments sit at [esp+4] block pointer (movdqu load), [esp+8] state (movdqa load and store, so 16-byte aligned), [esp+12] H (movdqa load). + mov edx, DWORD PTR [esp+4] + mov eax, DWORD PTR [esp+8] + mov ecx, DWORD PTR [esp+12] + movdqa xmm4, OWORD PTR [eax] + movdqa xmm5, OWORD PTR [ecx] + movdqu xmm0, OWORD PTR [edx] + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm0 + ; multiply xmm4 by xmm5: pshufd 78 swaps the 64-bit halves, then three pclmulqdq form the product Karatsuba-style (high*high, low*low, xored halves) + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm6, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm6, xmm2 + pxor xmm4, xmm1 + ; shift the 256-bit product (xmm4 high, xmm6 low) left by one bit, carrying bit 31 of each dword upward + movdqa xmm0, xmm6 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm6, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por xmm6, xmm0 + por xmm4, xmm1 + ; reduction: fold the low half xmm6 into xmm4 using shifts by 31, 30, 25 and then 1, 2, 7 + movdqa xmm0, xmm6 + movdqa xmm1, xmm6 + movdqa xmm2, xmm6 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm6, xmm0 + movdqa xmm2, xmm6 + movdqa xmm3, xmm6 + movdqa xmm0, xmm6 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm6 + pxor xmm4, xmm2 + ; write the new state back through eax + movdqa OWORD PTR [eax], xmm4 + ret +AES_GCM_ghash_block_aesni ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_encrypt_update_aesni PROC + push ebx + push esi + push edi + push ebp + sub esp, 96 + mov esi, DWORD PTR [esp+144] + movdqa xmm4, OWORD PTR [esi] + movdqu OWORD PTR [esp+64], xmm4 + mov esi, DWORD PTR [esp+136] + mov ebp, DWORD PTR [esp+140] + movdqa xmm6, OWORD PTR [esi] + movdqa xmm5, OWORD PTR [ebp] + movdqu OWORD PTR [esp+80], xmm6 + mov ebp, DWORD PTR [esp+116] + mov edi, DWORD PTR [esp+124] + mov esi, DWORD PTR 
[esp+128] + movdqa xmm1, xmm5 + movdqa xmm0, xmm5 + psrlq xmm1, 63 + psllq xmm0, 1 + pslldq xmm1, 8 + por xmm0, xmm1 + pshufd xmm5, xmm5, 255 + psrad xmm5, 31 + pand xmm5, OWORD PTR L_aes_gcm_mod2_128 + pxor xmm5, xmm0 + xor ebx, ebx + cmp DWORD PTR [esp+132], 64 + mov eax, DWORD PTR [esp+132] + jl L_AES_GCM_encrypt_update_aesni_done_64 + and eax, 4294967232 + movdqa xmm2, xmm6 + ; H ^ 1 + movdqu OWORD PTR [esp], xmm5 + ; H ^ 2 + pshufd xmm1, xmm5, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm5, 17 + pclmulqdq xmm0, xmm5, 0 + pxor xmm1, xmm5 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm0, xmm2 + pxor xmm4, xmm1 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + pslld xmm1, 31 + pslld xmm2, 30 + pslld xmm3, 25 + pxor xmm1, xmm2 + pxor xmm1, xmm3 + movdqa xmm3, xmm1 + psrldq xmm3, 4 + pslldq xmm1, 12 + pxor xmm0, xmm1 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + psrld xmm1, 1 + psrld xmm2, 2 + pxor xmm1, xmm2 + pxor xmm1, xmm0 + psrld xmm0, 7 + pxor xmm1, xmm3 + pxor xmm1, xmm0 + pxor xmm4, xmm1 + movdqu OWORD PTR [esp+16], xmm4 + ; H ^ 3 + pshufd xmm1, xmm5, 78 + pshufd xmm2, xmm4, 78 + movdqa xmm3, xmm4 + movdqa xmm0, xmm4 + pclmulqdq xmm3, xmm5, 17 + pclmulqdq xmm0, xmm5, 0 + pxor xmm1, xmm5 + pxor xmm2, xmm4 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm0, xmm2 + pxor xmm7, xmm1 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + pslld xmm1, 31 + pslld xmm2, 30 + pslld xmm3, 25 + pxor xmm1, xmm2 + pxor xmm1, xmm3 + movdqa xmm3, xmm1 + psrldq xmm3, 4 + pslldq xmm1, 12 + pxor xmm0, xmm1 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + psrld xmm1, 1 + psrld xmm2, 2 + pxor xmm1, xmm2 + pxor xmm1, xmm0 + psrld xmm0, 7 + pxor xmm1, xmm3 + pxor xmm1, xmm0 + pxor xmm7, xmm1 + movdqu OWORD PTR [esp+32], 
xmm7 + ; H ^ 4 + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm4, 78 + movdqa xmm3, xmm4 + movdqa xmm0, xmm4 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm4 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm0, xmm2 + pxor xmm7, xmm1 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + pslld xmm1, 31 + pslld xmm2, 30 + pslld xmm3, 25 + pxor xmm1, xmm2 + pxor xmm1, xmm3 + movdqa xmm3, xmm1 + psrldq xmm3, 4 + pslldq xmm1, 12 + pxor xmm0, xmm1 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + psrld xmm1, 1 + psrld xmm2, 2 + pxor xmm1, xmm2 + pxor xmm1, xmm0 + psrld xmm0, 7 + pxor xmm1, xmm3 + pxor xmm1, xmm0 + pxor xmm7, xmm1 + movdqu OWORD PTR [esp+48], xmm7 + ; First 64 bytes of input + ; Encrypt 64 bytes of counter + movdqu xmm0, OWORD PTR [esp+64] + movdqu xmm7, xmm0 + paddd xmm7, OWORD PTR L_aes_gcm_four + movdqu OWORD PTR [esp+64], xmm7 + movdqa xmm7, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + pshufb xmm0, xmm7 + paddd xmm1, OWORD PTR L_aes_gcm_one + pshufb xmm1, xmm7 + paddd xmm2, OWORD PTR L_aes_gcm_two + pshufb xmm2, xmm7 + paddd xmm3, OWORD PTR L_aes_gcm_three + pshufb xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp] + pxor xmm0, xmm7 + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+16] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+32] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+48] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+64] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+80] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+96] + 
aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+112] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+128] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+144] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + cmp DWORD PTR [esp+120], 11 + movdqa xmm7, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_update_aesni_enc_done + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+176] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + cmp DWORD PTR [esp+120], 13 + movdqa xmm7, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_update_aesni_enc_done + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+208] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_update_aesni_enc_done: + aesenclast xmm0, xmm7 + aesenclast xmm1, xmm7 + movdqu xmm4, OWORD PTR [esi] + movdqu xmm5, OWORD PTR [esi+16] + pxor xmm0, xmm4 + pxor xmm1, xmm5 + movdqu OWORD PTR [edi], xmm0 + movdqu OWORD PTR [edi+16], xmm1 + aesenclast xmm2, xmm7 + aesenclast xmm3, xmm7 + movdqu xmm4, OWORD PTR [esi+32] + movdqu xmm5, OWORD PTR [esi+48] + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movdqu OWORD PTR [edi+32], xmm2 + movdqu OWORD PTR [edi+48], xmm3 + cmp eax, 64 + mov ebx, 64 + mov ecx, esi + mov edx, edi + jle L_AES_GCM_encrypt_update_aesni_end_64 + ; More 64 bytes of input +L_AES_GCM_encrypt_update_aesni_ghash_64: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + ; Encrypt 64 bytes of counter + movdqu xmm0, OWORD PTR [esp+64] + movdqu xmm7, xmm0 + paddd xmm7, OWORD PTR L_aes_gcm_four + movdqu OWORD PTR [esp+64], xmm7 + movdqa xmm7, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm1, 
xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + pshufb xmm0, xmm7 + paddd xmm1, OWORD PTR L_aes_gcm_one + pshufb xmm1, xmm7 + paddd xmm2, OWORD PTR L_aes_gcm_two + pshufb xmm2, xmm7 + paddd xmm3, OWORD PTR L_aes_gcm_three + pshufb xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp] + pxor xmm0, xmm7 + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+16] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+32] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+48] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+64] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+80] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+96] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+112] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+128] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+144] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + cmp DWORD PTR [esp+120], 11 + movdqa xmm7, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+176] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + cmp DWORD PTR [esp+120], 13 + movdqa xmm7, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+208] + aesenc xmm0, xmm7 + aesenc xmm1, xmm7 + aesenc xmm2, xmm7 + 
aesenc xmm3, xmm7 + movdqa xmm7, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done: + aesenclast xmm0, xmm7 + aesenclast xmm1, xmm7 + movdqu xmm4, OWORD PTR [ecx] + movdqu xmm5, OWORD PTR [ecx+16] + pxor xmm0, xmm4 + pxor xmm1, xmm5 + movdqu OWORD PTR [edx], xmm0 + movdqu OWORD PTR [edx+16], xmm1 + aesenclast xmm2, xmm7 + aesenclast xmm3, xmm7 + movdqu xmm4, OWORD PTR [ecx+32] + movdqu xmm5, OWORD PTR [ecx+48] + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movdqu OWORD PTR [edx+32], xmm2 + movdqu OWORD PTR [edx+48], xmm3 + ; ghash encrypted counter + movdqu xmm2, OWORD PTR [esp+80] + movdqu xmm7, OWORD PTR [esp+48] + movdqu xmm0, OWORD PTR [edx+-64] + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm2 + pshufd xmm1, xmm7, 78 + pshufd xmm5, xmm0, 78 + pxor xmm1, xmm7 + pxor xmm5, xmm0 + movdqa xmm3, xmm0 + pclmulqdq xmm3, xmm7, 17 + movdqa xmm2, xmm0 + pclmulqdq xmm2, xmm7, 0 + pclmulqdq xmm1, xmm5, 0 + pxor xmm1, xmm2 + pxor xmm1, xmm3 + movdqu xmm7, OWORD PTR [esp+32] + movdqu xmm0, OWORD PTR [edx+-48] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + pclmulqdq xmm7, xmm0, 0 + pclmulqdq xmm4, xmm5, 0 + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, OWORD PTR [esp+16] + movdqu xmm0, OWORD PTR [edx+-32] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + pclmulqdq xmm7, xmm0, 0 + pclmulqdq xmm4, xmm5, 0 + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, OWORD PTR [esp] + movdqu xmm0, OWORD PTR [edx+-16] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + 
pclmulqdq xmm7, xmm0, 0 + pclmulqdq xmm4, xmm5, 0 + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqa xmm5, xmm1 + psrldq xmm1, 8 + pslldq xmm5, 8 + pxor xmm2, xmm5 + pxor xmm3, xmm1 + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + movdqa xmm5, xmm2 + pslld xmm7, 31 + pslld xmm4, 30 + pslld xmm5, 25 + pxor xmm7, xmm4 + pxor xmm7, xmm5 + movdqa xmm4, xmm7 + pslldq xmm7, 12 + psrldq xmm4, 4 + pxor xmm2, xmm7 + movdqa xmm5, xmm2 + movdqa xmm1, xmm2 + movdqa xmm0, xmm2 + psrld xmm5, 1 + psrld xmm1, 2 + psrld xmm0, 7 + pxor xmm5, xmm1 + pxor xmm5, xmm0 + pxor xmm5, xmm4 + pxor xmm2, xmm5 + pxor xmm2, xmm3 + movdqu OWORD PTR [esp+80], xmm2 + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_encrypt_update_aesni_ghash_64 +L_AES_GCM_encrypt_update_aesni_end_64: + movdqu xmm6, OWORD PTR [esp+80] + ; Block 1 + movdqa xmm0, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm5, OWORD PTR [edx] + pshufb xmm5, xmm0 + movdqu xmm7, OWORD PTR [esp+48] + pxor xmm5, xmm6 + pshufd xmm1, xmm5, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm5, 17 + pclmulqdq xmm0, xmm5, 0 + pxor xmm1, xmm5 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm4, xmm0 + movdqa xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + ; Block 2 + movdqa xmm0, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm5, OWORD PTR [edx+16] + pshufb xmm5, xmm0 + movdqu xmm7, OWORD PTR [esp+32] + pshufd xmm1, xmm5, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm5, 17 + pclmulqdq xmm0, xmm5, 0 + pxor xmm1, xmm5 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + ; Block 3 + movdqa xmm0, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm5, OWORD PTR [edx+32] + pshufb xmm5, xmm0 + movdqu xmm7, 
OWORD PTR [esp+16] + pshufd xmm1, xmm5, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm5, 17 + pclmulqdq xmm0, xmm5, 0 + pxor xmm1, xmm5 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + ; Block 4 + movdqa xmm0, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm5, OWORD PTR [edx+48] + pshufb xmm5, xmm0 + movdqu xmm7, OWORD PTR [esp] + pshufd xmm1, xmm5, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm5, 17 + pclmulqdq xmm0, xmm5, 0 + pxor xmm1, xmm5 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqa xmm0, xmm4 + movdqa xmm1, xmm4 + movdqa xmm2, xmm4 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm4, xmm0 + movdqa xmm2, xmm4 + movdqa xmm3, xmm4 + movdqa xmm0, xmm4 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm4 + pxor xmm6, xmm2 + movdqu xmm5, OWORD PTR [esp] +L_AES_GCM_encrypt_update_aesni_done_64: + mov edx, DWORD PTR [esp+132] + cmp ebx, edx + jge L_AES_GCM_encrypt_update_aesni_done_enc + mov eax, DWORD PTR [esp+132] + and eax, 4294967280 + cmp ebx, eax + jge L_AES_GCM_encrypt_update_aesni_last_block_done + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + movdqu xmm0, OWORD PTR [esp+64] + movdqa xmm1, xmm0 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm1, OWORD PTR L_aes_gcm_one + pxor xmm0, [ebp] + movdqu OWORD PTR [esp+64], xmm1 + aesenc xmm0, [ebp+16] + aesenc xmm0, [ebp+32] + aesenc xmm0, [ebp+48] + aesenc xmm0, [ebp+64] + aesenc xmm0, [ebp+80] + aesenc xmm0, [ebp+96] + 
aesenc xmm0, [ebp+112] + aesenc xmm0, [ebp+128] + aesenc xmm0, [ebp+144] + cmp DWORD PTR [esp+120], 11 + movdqa xmm1, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last + aesenc xmm0, xmm1 + aesenc xmm0, [ebp+176] + cmp DWORD PTR [esp+120], 13 + movdqa xmm1, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last + aesenc xmm0, xmm1 + aesenc xmm0, [ebp+208] + movdqa xmm1, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last: + aesenclast xmm0, xmm1 + movdqu xmm1, OWORD PTR [ecx] + pxor xmm0, xmm1 + movdqu OWORD PTR [edx], xmm0 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm6, xmm0 + add ebx, 16 + cmp ebx, eax + jge L_AES_GCM_encrypt_update_aesni_last_block_ghash +L_AES_GCM_encrypt_update_aesni_last_block_start: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + movdqu xmm0, OWORD PTR [esp+64] + movdqa xmm1, xmm0 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm1, OWORD PTR L_aes_gcm_one + pxor xmm0, [ebp] + movdqu OWORD PTR [esp+64], xmm1 + movdqu xmm4, xmm6 + pclmulqdq xmm4, xmm5, 16 + aesenc xmm0, [ebp+16] + aesenc xmm0, [ebp+32] + movdqu xmm7, xmm6 + pclmulqdq xmm7, xmm5, 1 + aesenc xmm0, [ebp+48] + aesenc xmm0, [ebp+64] + aesenc xmm0, [ebp+80] + movdqu xmm1, xmm6 + pclmulqdq xmm1, xmm5, 17 + aesenc xmm0, [ebp+96] + pxor xmm4, xmm7 + movdqa xmm2, xmm4 + psrldq xmm4, 8 + pslldq xmm2, 8 + aesenc xmm0, [ebp+112] + movdqu xmm7, xmm6 + pclmulqdq xmm7, xmm5, 0 + pxor xmm2, xmm7 + pxor xmm1, xmm4 + movdqa xmm3, OWORD PTR L_aes_gcm_mod2_128 + movdqa xmm7, xmm2 + pclmulqdq xmm7, xmm3, 16 + aesenc xmm0, [ebp+128] + pshufd xmm4, xmm2, 78 + pxor xmm4, xmm7 + movdqa xmm7, xmm4 + pclmulqdq xmm7, xmm3, 16 + aesenc xmm0, [ebp+144] + pshufd xmm6, xmm4, 78 + pxor xmm6, xmm7 + pxor xmm6, xmm1 + cmp DWORD PTR [esp+120], 11 + movdqa xmm1, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last + aesenc xmm0, xmm1 + aesenc xmm0, [ebp+176] + cmp 
DWORD PTR [esp+120], 13
+        movdqa xmm1, OWORD PTR [ebp+192]
+        jl L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
+        aesenc xmm0, xmm1
+        aesenc xmm0, [ebp+208]
+        movdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last:
+        aesenclast xmm0, xmm1
+        movdqu xmm1, OWORD PTR [ecx]
+        pxor xmm0, xmm1
+        movdqu OWORD PTR [edx], xmm0
+        pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        pxor xmm6, xmm0
+        add ebx, 16
+        cmp ebx, eax
+        jl L_AES_GCM_encrypt_update_aesni_last_block_start
+L_AES_GCM_encrypt_update_aesni_last_block_ghash:
+        pshufd xmm1, xmm5, 78
+        pshufd xmm2, xmm6, 78
+        movdqa xmm3, xmm6
+        movdqa xmm0, xmm6
+        pclmulqdq xmm3, xmm5, 17
+        pclmulqdq xmm0, xmm5, 0
+        pxor xmm1, xmm5
+        pxor xmm2, xmm6
+        pclmulqdq xmm1, xmm2, 0
+        pxor xmm1, xmm0
+        pxor xmm1, xmm3
+        movdqa xmm2, xmm1
+        movdqa xmm6, xmm3
+        pslldq xmm2, 8
+        psrldq xmm1, 8
+        pxor xmm0, xmm2
+        pxor xmm6, xmm1
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        movdqa xmm3, xmm0
+        pslld xmm1, 31
+        pslld xmm2, 30
+        pslld xmm3, 25
+        pxor xmm1, xmm2
+        pxor xmm1, xmm3
+        movdqa xmm3, xmm1
+        psrldq xmm3, 4
+        pslldq xmm1, 12
+        pxor xmm0, xmm1
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        psrld xmm1, 1
+        psrld xmm2, 2
+        pxor xmm1, xmm2
+        pxor xmm1, xmm0
+        psrld xmm0, 7
+        pxor xmm1, xmm3
+        pxor xmm1, xmm0
+        pxor xmm6, xmm1
+L_AES_GCM_encrypt_update_aesni_last_block_done:
+L_AES_GCM_encrypt_update_aesni_done_enc:
+        mov esi, DWORD PTR [esp+136]
+        mov edi, DWORD PTR [esp+144]
+        movdqu xmm4, OWORD PTR [esp+64]
+        movdqa OWORD PTR [esi], xmm6
+        movdqu OWORD PTR [edi], xmm4
+        add esp, 96
+        pop ebp
+        pop edi
+        pop esi
+        pop ebx
+        ret
+AES_GCM_encrypt_update_aesni ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_encrypt_final_aesni
+; Folds a 128-bit length block into the hash value, multiplies by the hash
+; key, XORs in one more 16-byte block and stores the result as the tag.
+; Frame: 3 saved registers + 16 bytes of scratch, so the return address is
+; at [esp+28] and the stack arguments are read from:
+;   [esp+32] -> 16-byte hash value (aligned load; presumably the running
+;               GHASH state - confirm against the C prototype)
+;   [esp+36] -> tag output buffer
+;   [esp+40]    number of tag bytes to store
+;   [esp+44]    byte count placed in the low 64 bits of the length block
+;   [esp+48]    byte count placed in the high 64 bits of the length block
+;   [esp+52] -> hash key H (aligned load)
+;   [esp+56] -> 16-byte block XORed into the final value (aligned load;
+;               presumably the encrypted initial counter - confirm)
+AES_GCM_encrypt_final_aesni PROC
+        push esi
+        push edi
+        push ebp
+        sub esp, 16
+        mov ebp, DWORD PTR [esp+32]
+        mov esi, DWORD PTR [esp+52]
+        mov edi, DWORD PTR [esp+56]
+        movdqa xmm4, OWORD PTR [ebp]
+        movdqa xmm5, OWORD PTR [esi]
+        movdqa xmm6, OWORD PTR [edi]
+        ; xmm5 = H shifted left by one bit across all 128 bits, XORed with
+        ; L_aes_gcm_mod2_128 when the top bit of H was set.
+        movdqa xmm1, xmm5
+        movdqa xmm0, xmm5
+        psrlq xmm1, 63
+        psllq xmm0, 1
+        pslldq xmm1, 8
+        por xmm0, xmm1
+        pshufd xmm5, xmm5, 255
+        psrad xmm5, 31
+        pand xmm5, OWORD PTR L_aes_gcm_mod2_128
+        pxor xmm5, xmm0
+        ; Build the length block in xmm0: each 32-bit byte count becomes a
+        ; 64-bit bit count (low dword = count << 3, high dword = count >> 29).
+        ; All four dwords of xmm0 are overwritten.
+        mov edx, DWORD PTR [esp+44]
+        mov ecx, DWORD PTR [esp+48]
+        shl edx, 3
+        shl ecx, 3
+        pinsrd xmm0, edx, 0
+        pinsrd xmm0, ecx, 2
+        mov edx, DWORD PTR [esp+44]
+        mov ecx, DWORD PTR [esp+48]
+        shr edx, 29
+        shr ecx, 29
+        pinsrd xmm0, edx, 1
+        pinsrd xmm0, ecx, 3
+        pxor xmm4, xmm0
+        ; Carry-less multiply of xmm4 by xmm5 using three pclmulqdq
+        ; (high*high, low*low, and the folded middle term).
+        pshufd xmm1, xmm5, 78
+        pshufd xmm2, xmm4, 78
+        movdqa xmm3, xmm4
+        movdqa xmm0, xmm4
+        pclmulqdq xmm3, xmm5, 17
+        pclmulqdq xmm0, xmm5, 0
+        pxor xmm1, xmm5
+        pxor xmm2, xmm4
+        pclmulqdq xmm1, xmm2, 0
+        pxor xmm1, xmm0
+        pxor xmm1, xmm3
+        movdqa xmm2, xmm1
+        movdqa xmm4, xmm3
+        pslldq xmm2, 8
+        psrldq xmm1, 8
+        pxor xmm0, xmm2
+        pxor xmm4, xmm1
+        ; Shift/XOR sequence that folds the low half (xmm0) into the high
+        ; half (xmm4), leaving the 128-bit result in xmm4.
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        movdqa xmm3, xmm0
+        pslld xmm1, 31
+        pslld xmm2, 30
+        pslld xmm3, 25
+        pxor xmm1, xmm2
+        pxor xmm1, xmm3
+        movdqa xmm3, xmm1
+        psrldq xmm3, 4
+        pslldq xmm1, 12
+        pxor xmm0, xmm1
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        psrld xmm1, 1
+        psrld xmm2, 2
+        pxor xmm1, xmm2
+        pxor xmm1, xmm0
+        psrld xmm0, 7
+        pxor xmm1, xmm3
+        pxor xmm1, xmm0
+        pxor xmm4, xmm1
+        pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask
+        ; xmm0 = block from [esp+56] XOR byte-swapped hash result = tag value
+        movdqu xmm0, xmm6
+        pxor xmm0, xmm4
+        mov edi, DWORD PTR [esp+36]
+        cmp DWORD PTR [esp+40], 16
+        je L_AES_GCM_encrypt_final_aesni_store_tag_16
+        ; Tag length other than 16: spill to the scratch slot and copy byte by byte.
+        ; NOTE(review): the loop body runs before the length check, so a
+        ; length of 0 is not handled here - confirm callers never pass 0.
+        xor ecx, ecx
+        movdqu OWORD PTR [esp], xmm0
+L_AES_GCM_encrypt_final_aesni_store_tag_loop:
+        movzx eax, BYTE PTR [esp+ecx]
+        mov BYTE PTR [edi+ecx], al
+        inc ecx
+        cmp ecx, DWORD PTR [esp+40]
+        jne L_AES_GCM_encrypt_final_aesni_store_tag_loop
+        jmp L_AES_GCM_encrypt_final_aesni_store_tag_done
+L_AES_GCM_encrypt_final_aesni_store_tag_16:
+        ; Exactly 16 bytes: one unaligned store.
+        movdqu OWORD PTR [edi], xmm0
+L_AES_GCM_encrypt_final_aesni_store_tag_done:
+        add esp, 16
+        pop ebp
+        pop edi
+        pop esi
+        ret
+AES_GCM_encrypt_final_aesni ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_update_aesni PROC
+        push ebx
+        push esi
+        push edi
+        push ebp
+        sub
esp, 160
+        ; Frame: 4 saved registers + 160 bytes of locals, so the return address
+        ; is at [esp+176] and the stack arguments are read from:
+        ;   [esp+180] -> round keys (ebp)   [esp+184] value compared with 11 / 13
+        ;                                             to choose the last round key
+        ;   [esp+188] -> output (edi)       [esp+192] -> input (esi)
+        ;   [esp+196]    byte count         [esp+200] -> hash value, read and written back
+        ;   [esp+204] -> hash key H         [esp+208] -> counter block, read and written back
+        ; Locals: [esp+0] H^1, [esp+16] H^2, [esp+32] H^3, [esp+48] H^4,
+        ;         [esp+64] counter, [esp+80] hash value,
+        ;         [esp+96..159] copy of 64 input bytes (in-place path only).
+        mov esi, DWORD PTR [esp+208]
+        movdqa xmm4, OWORD PTR [esi]
+        movdqu OWORD PTR [esp+64], xmm4
+        mov esi, DWORD PTR [esp+200]
+        mov ebp, DWORD PTR [esp+204]
+        movdqa xmm6, OWORD PTR [esi]
+        movdqa xmm5, OWORD PTR [ebp]
+        movdqu OWORD PTR [esp+80], xmm6
+        mov ebp, DWORD PTR [esp+180]
+        mov edi, DWORD PTR [esp+188]
+        mov esi, DWORD PTR [esp+192]
+        ; xmm5 = H shifted left by one bit across all 128 bits, XORed with
+        ; L_aes_gcm_mod2_128 when the top bit of H was set.
+        movdqa xmm1, xmm5
+        movdqa xmm0, xmm5
+        psrlq xmm1, 63
+        psllq xmm0, 1
+        pslldq xmm1, 8
+        por xmm0, xmm1
+        pshufd xmm5, xmm5, 255
+        psrad xmm5, 31
+        pand xmm5, OWORD PTR L_aes_gcm_mod2_128
+        pxor xmm5, xmm0
+        xor ebx, ebx  ; ebx = byte offset processed so far
+        cmp DWORD PTR [esp+196], 64
+        mov eax, DWORD PTR [esp+196]
+        jl L_AES_GCM_decrypt_update_aesni_done_64  ; signed compare: under 64 bytes skips the 4-block loop
+        and eax, 4294967232  ; eax = byte count rounded down to a multiple of 64
+        movdqa xmm2, xmm6  ; NOTE(review): xmm2 is overwritten by the pshufd below before it is read
+        ; H ^ 1
+        movdqu OWORD PTR [esp], xmm5
+        ; H ^ 2
+        pshufd xmm1, xmm5, 78
+        pshufd xmm2, xmm5, 78
+        movdqa xmm3, xmm5
+        movdqa xmm0, xmm5
+        pclmulqdq xmm3, xmm5, 17
+        pclmulqdq xmm0, xmm5, 0
+        pxor xmm1, xmm5
+        pxor xmm2, xmm5
+        pclmulqdq xmm1, xmm2, 0
+        pxor xmm1, xmm0
+        pxor xmm1, xmm3
+        movdqa xmm2, xmm1
+        movdqa xmm4, xmm3
+        pslldq xmm2, 8
+        psrldq xmm1, 8
+        pxor xmm0, xmm2
+        pxor xmm4, xmm1
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        movdqa xmm3, xmm0
+        pslld xmm1, 31
+        pslld xmm2, 30
+        pslld xmm3, 25
+        pxor xmm1, xmm2
+        pxor xmm1, xmm3
+        movdqa xmm3, xmm1
+        psrldq xmm3, 4
+        pslldq xmm1, 12
+        pxor xmm0, xmm1
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        psrld xmm1, 1
+        psrld xmm2, 2
+        pxor xmm1, xmm2
+        pxor xmm1, xmm0
+        psrld xmm0, 7
+        pxor xmm1, xmm3
+        pxor xmm1, xmm0
+        pxor xmm4, xmm1
+        movdqu OWORD PTR [esp+16], xmm4
+        ; H ^ 3
+        pshufd xmm1, xmm5, 78
+        pshufd xmm2, xmm4, 78
+        movdqa xmm3, xmm4
+        movdqa xmm0, xmm4
+        pclmulqdq xmm3, xmm5, 17
+        pclmulqdq xmm0, xmm5, 0
+        pxor xmm1, xmm5
+        pxor xmm2, xmm4
+        pclmulqdq xmm1, xmm2, 0
+        pxor xmm1, xmm0
+        pxor xmm1, xmm3
+        movdqa xmm2, xmm1
+        movdqa xmm7, xmm3
+        pslldq xmm2, 8
+        psrldq xmm1, 8
+        pxor xmm0, xmm2
+        pxor xmm7, xmm1
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        movdqa xmm3, xmm0
+        pslld xmm1, 31
+        pslld xmm2, 30
+        pslld xmm3, 25
+        pxor xmm1, xmm2
+        pxor xmm1, xmm3
+        movdqa xmm3, xmm1
+        psrldq xmm3, 4
+        pslldq xmm1, 12
+        pxor xmm0, xmm1
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        psrld xmm1, 1
+        psrld xmm2, 2
+        pxor xmm1, xmm2
+        pxor xmm1, xmm0
+        psrld xmm0, 7
+        pxor xmm1, xmm3
+        pxor xmm1, xmm0
+        pxor xmm7, xmm1
+        movdqu OWORD PTR [esp+32], xmm7
+        ; H ^ 4
+        pshufd xmm1, xmm4, 78
+        pshufd xmm2, xmm4, 78
+        movdqa xmm3, xmm4
+        movdqa xmm0, xmm4
+        pclmulqdq xmm3, xmm4, 17
+        pclmulqdq xmm0, xmm4, 0
+        pxor xmm1, xmm4
+        pxor xmm2, xmm4
+        pclmulqdq xmm1, xmm2, 0
+        pxor xmm1, xmm0
+        pxor xmm1, xmm3
+        movdqa xmm2, xmm1
+        movdqa xmm7, xmm3
+        pslldq xmm2, 8
+        psrldq xmm1, 8
+        pxor xmm0, xmm2
+        pxor xmm7, xmm1
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        movdqa xmm3, xmm0
+        pslld xmm1, 31
+        pslld xmm2, 30
+        pslld xmm3, 25
+        pxor xmm1, xmm2
+        pxor xmm1, xmm3
+        movdqa xmm3, xmm1
+        psrldq xmm3, 4
+        pslldq xmm1, 12
+        pxor xmm0, xmm1
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        psrld xmm1, 1
+        psrld xmm2, 2
+        pxor xmm1, xmm2
+        pxor xmm1, xmm0
+        psrld xmm0, 7
+        pxor xmm1, xmm3
+        pxor xmm1, xmm0
+        pxor xmm7, xmm1
+        movdqu OWORD PTR [esp+48], xmm7
+        ; When output and input pointers are equal, fall through to the loop
+        ; that saves each input block on the stack before it is overwritten.
+        cmp edi, esi
+        jne L_AES_GCM_decrypt_update_aesni_ghash_64
+L_AES_GCM_decrypt_update_aesni_ghash_64_inplace:
+        lea ecx, DWORD PTR [esi+ebx]
+        lea edx, DWORD PTR [edi+ebx]
+        ; Encrypt 64 bytes of counter
+        movdqu xmm0, OWORD PTR [esp+64]
+        movdqu xmm7, xmm0
+        paddd xmm7, OWORD PTR L_aes_gcm_four
+        movdqu OWORD PTR [esp+64], xmm7
+        movdqa xmm7, OWORD PTR L_aes_gcm_bswap_epi64
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        movdqa xmm3, xmm0
+        pshufb xmm0, xmm7
+        paddd xmm1, OWORD PTR L_aes_gcm_one
+        pshufb xmm1, xmm7
+        paddd xmm2, OWORD PTR L_aes_gcm_two
+        pshufb xmm2, xmm7
+        paddd xmm3, OWORD PTR L_aes_gcm_three
+        pshufb xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp]
+        pxor xmm0, xmm7
+        pxor xmm1, xmm7
+        pxor xmm2, xmm7
+        pxor xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+16]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+32]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+48]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+64]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+80]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+96]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+112]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+128]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+144]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        ; [esp+184] < 11: the key at [ebp+160] is the last one; < 13: [ebp+192]; otherwise [ebp+224].
+        cmp DWORD PTR [esp+184], 11
+        movdqa xmm7, OWORD PTR [ebp+160]
+        jl L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+176]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        cmp DWORD PTR [esp+184], 13
+        movdqa xmm7, OWORD PTR [ebp+192]
+        jl L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+208]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done:
+        aesenclast xmm0, xmm7
+        aesenclast xmm1, xmm7
+        movdqu xmm4, OWORD PTR [ecx]
+        movdqu xmm5, OWORD PTR [ecx+16]
+        pxor xmm0, xmm4
+        pxor xmm1, xmm5
+        movdqu OWORD PTR [esp+96], xmm4  ; keep the input blocks: the stores to [edx] below overwrite them
+        movdqu OWORD PTR [esp+112], xmm5
+        movdqu OWORD PTR [edx], xmm0
+        movdqu OWORD PTR [edx+16], xmm1
+        aesenclast xmm2, xmm7
+        aesenclast xmm3, xmm7
+        movdqu xmm4, OWORD PTR [ecx+32]
+        movdqu xmm5, OWORD PTR [ecx+48]
+        pxor xmm2, xmm4
+        pxor xmm3, xmm5
+        movdqu OWORD PTR [esp+128], xmm4
+        movdqu OWORD PTR [esp+144], xmm5
+        movdqu OWORD PTR [edx+32], xmm2
+        movdqu OWORD PTR [edx+48], xmm3
+        ; ghash the four input blocks saved at [esp+96..159]:
+        ; (hash ^ block0)*H^4, block1*H^3, block2*H^2, block3*H^1, then one reduction
+        movdqu xmm2, OWORD PTR [esp+80]
+        movdqu xmm7, OWORD PTR [esp+48]
+        movdqu xmm0, OWORD PTR [esp+96]
+        pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        pxor xmm0, xmm2
+        pshufd xmm1, xmm7, 78
+        pshufd xmm5, xmm0, 78
+        pxor xmm1, xmm7
+        pxor xmm5, xmm0
+        movdqa xmm3, xmm0
+        pclmulqdq xmm3, xmm7, 17
+        movdqa xmm2, xmm0
+        pclmulqdq xmm2, xmm7, 0
+        pclmulqdq xmm1, xmm5, 0
+        pxor xmm1, xmm2
+        pxor xmm1, xmm3
+        movdqu xmm7, OWORD PTR [esp+32]
+        movdqu xmm0, OWORD PTR [esp+112]
+        pshufd xmm4, xmm7, 78
+        pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        pxor xmm4, xmm7
+        pshufd xmm5, xmm0, 78
+        pxor xmm5, xmm0
+        movdqa xmm6, xmm0
+        pclmulqdq xmm6, xmm7, 17
+        pclmulqdq xmm7, xmm0, 0
+        pclmulqdq xmm4, xmm5, 0
+        pxor xmm1, xmm7
+        pxor xmm2, xmm7
+        pxor xmm1, xmm6
+        pxor xmm3, xmm6
+        pxor xmm1, xmm4
+        movdqu xmm7, OWORD PTR [esp+16]
+        movdqu xmm0, OWORD PTR [esp+128]
+        pshufd xmm4, xmm7, 78
+        pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        pxor xmm4, xmm7
+        pshufd xmm5, xmm0, 78
+        pxor xmm5, xmm0
+        movdqa xmm6, xmm0
+        pclmulqdq xmm6, xmm7, 17
+        pclmulqdq xmm7, xmm0, 0
+        pclmulqdq xmm4, xmm5, 0
+        pxor xmm1, xmm7
+        pxor xmm2, xmm7
+        pxor xmm1, xmm6
+        pxor xmm3, xmm6
+        pxor xmm1, xmm4
+        movdqu xmm7, OWORD PTR [esp]
+        movdqu xmm0, OWORD PTR [esp+144]
+        pshufd xmm4, xmm7, 78
+        pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        pxor xmm4, xmm7
+        pshufd xmm5, xmm0, 78
+        pxor xmm5, xmm0
+        movdqa xmm6, xmm0
+        pclmulqdq xmm6, xmm7, 17
+        pclmulqdq xmm7, xmm0, 0
+        pclmulqdq xmm4, xmm5, 0
+        pxor xmm1, xmm7
+        pxor xmm2, xmm7
+        pxor xmm1, xmm6
+        pxor xmm3, xmm6
+        pxor xmm1, xmm4
+        movdqa xmm5, xmm1
+        psrldq xmm1, 8
+        pslldq xmm5, 8
+        pxor xmm2, xmm5
+        pxor xmm3, xmm1
+        movdqa xmm7, xmm2
+        movdqa xmm4, xmm2
+        movdqa xmm5, xmm2
+        pslld xmm7, 31
+        pslld xmm4, 30
+        pslld xmm5, 25
+        pxor xmm7, xmm4
+        pxor xmm7, xmm5
+        movdqa xmm4, xmm7
+        pslldq xmm7, 12
+        psrldq xmm4, 4
+        pxor xmm2, xmm7
+        movdqa xmm5, xmm2
+        movdqa xmm1, xmm2
+        movdqa xmm0, xmm2
+        psrld xmm5, 1
+        psrld xmm1, 2
+        psrld xmm0, 7
+        pxor xmm5, xmm1
+        pxor xmm5, xmm0
+        pxor xmm5, xmm4
+        pxor xmm2, xmm5
+        pxor xmm2, xmm3
+        movdqu OWORD PTR [esp+80], xmm2
+        add ebx, 64
+        cmp ebx, eax
+        jl L_AES_GCM_decrypt_update_aesni_ghash_64_inplace
+        jmp L_AES_GCM_decrypt_update_aesni_ghash_64_done
+L_AES_GCM_decrypt_update_aesni_ghash_64:
+        lea ecx, DWORD PTR [esi+ebx]
+        lea edx, DWORD PTR [edi+ebx]
+        ; Encrypt 64 bytes of counter
+        movdqu xmm0, OWORD PTR [esp+64]
+        movdqu xmm7, xmm0
+        paddd xmm7, OWORD PTR L_aes_gcm_four
+        movdqu OWORD PTR [esp+64], xmm7
+        movdqa xmm7, OWORD PTR L_aes_gcm_bswap_epi64
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        movdqa xmm3, xmm0
+        pshufb xmm0, xmm7
+        paddd xmm1, OWORD PTR L_aes_gcm_one
+        pshufb xmm1, xmm7
+        paddd xmm2, OWORD PTR L_aes_gcm_two
+        pshufb xmm2, xmm7
+        paddd xmm3, OWORD PTR L_aes_gcm_three
+        pshufb xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp]
+        pxor xmm0, xmm7
+        pxor xmm1, xmm7
+        pxor xmm2, xmm7
+        pxor xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+16]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+32]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+48]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+64]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+80]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+96]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+112]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+128]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+144]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        cmp DWORD PTR [esp+184], 11
+        movdqa xmm7, OWORD PTR [ebp+160]
+        jl L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+176]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        cmp DWORD PTR [esp+184], 13
+        movdqa xmm7, OWORD PTR [ebp+192]
+        jl L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+208]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        movdqa xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done:
+        aesenclast xmm0, xmm7
+        aesenclast xmm1, xmm7
+        movdqu xmm4, OWORD PTR [ecx]
+        movdqu xmm5, OWORD PTR [ecx+16]
+        pxor xmm0, xmm4
+        pxor xmm1, xmm5
+        movdqu OWORD PTR [edx], xmm0
+        movdqu OWORD PTR [edx+16], xmm1
+        aesenclast xmm2, xmm7
+        aesenclast xmm3, xmm7
+        movdqu xmm4, OWORD PTR [ecx+32]
+        movdqu xmm5, OWORD PTR [ecx+48]
+        pxor xmm2, xmm4
+        pxor xmm3, xmm5
+        movdqu OWORD PTR [edx+32], xmm2
+        movdqu OWORD PTR [edx+48], xmm3
+        ; ghash the four input blocks, re-read from [ecx] after the output stores
+        ; (this path is only taken when the output and input pointers differ)
+        movdqu xmm2, OWORD PTR [esp+80]
+        movdqu xmm7, OWORD PTR [esp+48]
+        movdqu xmm0, OWORD PTR [ecx]
+        pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        pxor xmm0, xmm2
+        pshufd xmm1, xmm7, 78
+        pshufd xmm5, xmm0, 78
+        pxor xmm1, xmm7
+        pxor xmm5, xmm0
+        movdqa xmm3, xmm0
+        pclmulqdq xmm3, xmm7, 17
+        movdqa xmm2, xmm0
+        pclmulqdq xmm2, xmm7, 0
+        pclmulqdq xmm1, xmm5, 0
+        pxor xmm1, xmm2
+        pxor xmm1, xmm3
+        movdqu xmm7, OWORD PTR [esp+32]
+        movdqu xmm0, OWORD PTR [ecx+16]
+        pshufd xmm4, xmm7, 78
+        pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        pxor xmm4, xmm7
+        pshufd xmm5, xmm0, 78
+        pxor xmm5, xmm0
+        movdqa xmm6, xmm0
+        pclmulqdq xmm6, xmm7, 17
+        pclmulqdq xmm7, xmm0, 0
+        pclmulqdq xmm4, xmm5, 0
+        pxor xmm1, xmm7
+        pxor xmm2, xmm7
+        pxor xmm1, xmm6
+        pxor xmm3, xmm6
+        pxor xmm1, xmm4
+        movdqu xmm7, OWORD PTR [esp+16]
+        movdqu xmm0, OWORD PTR [ecx+32]
+        pshufd xmm4, xmm7, 78
+        pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        pxor xmm4, xmm7
+        pshufd xmm5, xmm0, 78
+        pxor xmm5, xmm0
+        movdqa xmm6, xmm0
+        pclmulqdq xmm6, xmm7, 17
+        pclmulqdq xmm7, xmm0, 0
+        pclmulqdq xmm4, xmm5, 0
+        pxor xmm1, xmm7
+        pxor xmm2, xmm7
+        pxor xmm1, xmm6
+        pxor xmm3, xmm6
+        pxor xmm1, xmm4
+        movdqu xmm7, OWORD PTR [esp]
+        movdqu xmm0, OWORD PTR [ecx+48]
+        pshufd xmm4, xmm7, 78
+        pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        pxor xmm4, xmm7
+        pshufd xmm5, xmm0, 78
+        pxor xmm5, xmm0
+        movdqa xmm6, xmm0
+        pclmulqdq xmm6, xmm7, 17
+        pclmulqdq xmm7, xmm0, 0
+        pclmulqdq xmm4, xmm5, 0
+        pxor xmm1, xmm7
+        pxor xmm2, xmm7
+        pxor xmm1, xmm6
+        pxor xmm3, xmm6
+        pxor xmm1, xmm4
+        movdqa xmm5, xmm1
+        psrldq xmm1, 8
+        pslldq xmm5, 8
+        pxor xmm2, xmm5
+        pxor xmm3, xmm1
+        movdqa xmm7, xmm2
+        movdqa xmm4, xmm2
+        movdqa xmm5, xmm2
+        pslld xmm7, 31
+        pslld xmm4, 30
+        pslld xmm5, 25
+        pxor xmm7, xmm4
+        pxor xmm7, xmm5
+        movdqa xmm4, xmm7
+        pslldq xmm7, 12
+        psrldq xmm4, 4
+        pxor xmm2, xmm7
+        movdqa xmm5, xmm2
+        movdqa xmm1, xmm2
+        movdqa xmm0, xmm2
+        psrld xmm5, 1
+        psrld xmm1, 2
+        psrld xmm0, 7
+        pxor xmm5, xmm1
+        pxor xmm5, xmm0
+        pxor xmm5, xmm4
+        pxor xmm2, xmm5
+        pxor xmm2, xmm3
+        movdqu OWORD PTR [esp+80], xmm2
+        add ebx, 64
+        cmp ebx, eax
+        jl L_AES_GCM_decrypt_update_aesni_ghash_64
+L_AES_GCM_decrypt_update_aesni_ghash_64_done:
+        movdqa xmm6, xmm2  ; xmm6 = hash value after the 64-byte loop
+        movdqu xmm5, OWORD PTR [esp]  ; reload H^1 (xmm5 was used as a temporary in the loop)
+L_AES_GCM_decrypt_update_aesni_done_64:
+        mov edx, DWORD PTR [esp+196]
+        cmp ebx, edx
+        jge L_AES_GCM_decrypt_update_aesni_done_dec
+        mov eax, DWORD PTR [esp+196]
+        and eax, 4294967280  ; eax = byte count rounded down to a multiple of 16
+        cmp ebx, eax
+        jge L_AES_GCM_decrypt_update_aesni_last_block_done
+        ; One 16-byte block per iteration; bytes past the last full block
+        ; are not processed by this routine.
+L_AES_GCM_decrypt_update_aesni_last_block_start:
+        lea ecx, DWORD PTR [esi+ebx]
+        lea edx, DWORD PTR [edi+ebx]
+        movdqu xmm1, OWORD PTR [ecx]
+        pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask
+        pxor xmm1, xmm6
+        movdqu OWORD PTR [esp], xmm1  ; [esp] reused as scratch for (input block ^ hash value); H stays in xmm5
+        movdqu xmm0, OWORD PTR [esp+64]
+        movdqa xmm1, xmm0
+        pshufb xmm0, OWORD PTR L_aes_gcm_bswap_epi64
+        paddd xmm1, OWORD PTR L_aes_gcm_one
+        pxor xmm0, [ebp]
+        movdqu OWORD PTR [esp+64], xmm1
+        ; The AES rounds on xmm0 are interleaved with the multiply of [esp] by xmm5.
+        movdqu xmm4, OWORD PTR [esp]
+        pclmulqdq xmm4, xmm5, 16
+        aesenc xmm0, [ebp+16]
+        aesenc xmm0, [ebp+32]
+        movdqu xmm7, OWORD PTR [esp]
+        pclmulqdq xmm7, xmm5, 1
+        aesenc xmm0, [ebp+48]
+        aesenc xmm0, [ebp+64]
+        aesenc xmm0, [ebp+80]
+        movdqu xmm1, OWORD PTR [esp]
+        pclmulqdq xmm1, xmm5, 17
+        aesenc xmm0, [ebp+96]
+        pxor xmm4, xmm7
+        movdqa xmm2, xmm4
+        psrldq xmm4, 8
+        pslldq xmm2, 8
+        aesenc xmm0, [ebp+112]
+        movdqu xmm7, OWORD PTR [esp]
+        pclmulqdq xmm7, xmm5, 0
+        pxor xmm2, xmm7
+        pxor xmm1, xmm4
+        movdqa xmm3, OWORD PTR L_aes_gcm_mod2_128
+        movdqa xmm7, xmm2
+        pclmulqdq xmm7, xmm3, 16
+        aesenc xmm0, [ebp+128]
+        pshufd xmm4, xmm2, 78
+        pxor xmm4, xmm7
+        movdqa xmm7, xmm4
+        pclmulqdq xmm7, xmm3, 16
+        aesenc xmm0, [ebp+144]
+        pshufd xmm6, xmm4, 78
+        pxor xmm6, xmm7
+        pxor xmm6, xmm1  ; xmm6 = new hash value
+        cmp DWORD PTR [esp+184], 11
+        movdqa xmm1, OWORD PTR [ebp+160]
+        jl L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
+        aesenc xmm0, xmm1
+        aesenc xmm0, [ebp+176]
+        cmp DWORD PTR [esp+184], 13
+        movdqa xmm1, OWORD PTR [ebp+192]
+        jl L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
+        aesenc xmm0, xmm1
+        aesenc xmm0, [ebp+208]
+        movdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last:
+        aesenclast xmm0, xmm1
+        movdqu xmm1, OWORD PTR [ecx]
+        pxor xmm0, xmm1
+        movdqu OWORD PTR [edx], xmm0
+        add ebx, 16
+        cmp ebx, eax
+        jl L_AES_GCM_decrypt_update_aesni_last_block_start
+L_AES_GCM_decrypt_update_aesni_last_block_done:
+L_AES_GCM_decrypt_update_aesni_done_dec:
+        ; Write the hash value back through [esp+200] and the counter through [esp+208].
+        mov esi, DWORD PTR [esp+200]
+        mov edi, DWORD PTR [esp+208]
+        movdqu xmm4, OWORD PTR [esp+64]
+        movdqa OWORD PTR [esi], xmm6
+        movdqu OWORD
PTR [edi], xmm4
+        add esp, 160
+        pop ebp
+        pop edi
+        pop esi
+        pop ebx
+        ret
+AES_GCM_decrypt_update_aesni ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_decrypt_final_aesni
+; Folds a 128-bit length block into the hash value, multiplies by the hash
+; key, XORs in one more 16-byte block to get the expected tag, compares it
+; with the caller's tag and stores 1 (equal) or 0 (different) through the
+; result pointer.
+; Frame: 4 saved registers + 16 bytes of scratch, so the return address is
+; at [esp+32] and the stack arguments are read from:
+;   [esp+36] -> 16-byte hash value (aligned load; presumably the running
+;               GHASH state - confirm against the C prototype)
+;   [esp+40] -> tag to verify
+;   [esp+44]    number of tag bytes to compare
+;   [esp+48]    byte count placed in the low 64 bits of the length block
+;   [esp+52]    byte count placed in the high 64 bits of the length block
+;   [esp+56] -> hash key H (aligned load)
+;   [esp+60] -> 16-byte block XORed into the expected tag (aligned load;
+;               presumably the encrypted initial counter - confirm)
+;   [esp+64] -> 32-bit result
+AES_GCM_decrypt_final_aesni PROC
+        push ebx
+        push esi
+        push edi
+        push ebp
+        sub esp, 16
+        mov ebp, DWORD PTR [esp+36]
+        mov esi, DWORD PTR [esp+56]
+        mov edi, DWORD PTR [esp+60]
+        movdqa xmm6, OWORD PTR [ebp]
+        movdqa xmm5, OWORD PTR [esi]
+        movdqa xmm7, OWORD PTR [edi]
+        ; xmm5 = H shifted left by one bit across all 128 bits, XORed with
+        ; L_aes_gcm_mod2_128 when the top bit of H was set.
+        movdqa xmm1, xmm5
+        movdqa xmm0, xmm5
+        psrlq xmm1, 63
+        psllq xmm0, 1
+        pslldq xmm1, 8
+        por xmm0, xmm1
+        pshufd xmm5, xmm5, 255
+        psrad xmm5, 31
+        pand xmm5, OWORD PTR L_aes_gcm_mod2_128
+        pxor xmm5, xmm0
+        ; Build the length block in xmm0: each 32-bit byte count becomes a
+        ; 64-bit bit count (low dword = count << 3, high dword = count >> 29).
+        ; All four dwords of xmm0 are overwritten.
+        mov edx, DWORD PTR [esp+48]
+        mov ecx, DWORD PTR [esp+52]
+        shl edx, 3
+        shl ecx, 3
+        pinsrd xmm0, edx, 0
+        pinsrd xmm0, ecx, 2
+        mov edx, DWORD PTR [esp+48]
+        mov ecx, DWORD PTR [esp+52]
+        shr edx, 29
+        shr ecx, 29
+        pinsrd xmm0, edx, 1
+        pinsrd xmm0, ecx, 3
+        pxor xmm6, xmm0
+        ; Carry-less multiply of xmm6 by xmm5 using three pclmulqdq
+        ; (high*high, low*low, and the folded middle term).
+        pshufd xmm1, xmm5, 78
+        pshufd xmm2, xmm6, 78
+        movdqa xmm3, xmm6
+        movdqa xmm0, xmm6
+        pclmulqdq xmm3, xmm5, 17
+        pclmulqdq xmm0, xmm5, 0
+        pxor xmm1, xmm5
+        pxor xmm2, xmm6
+        pclmulqdq xmm1, xmm2, 0
+        pxor xmm1, xmm0
+        pxor xmm1, xmm3
+        movdqa xmm2, xmm1
+        movdqa xmm6, xmm3
+        pslldq xmm2, 8
+        psrldq xmm1, 8
+        pxor xmm0, xmm2
+        pxor xmm6, xmm1
+        ; Shift/XOR sequence that folds the low half (xmm0) into the high
+        ; half (xmm6), leaving the 128-bit result in xmm6.
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        movdqa xmm3, xmm0
+        pslld xmm1, 31
+        pslld xmm2, 30
+        pslld xmm3, 25
+        pxor xmm1, xmm2
+        pxor xmm1, xmm3
+        movdqa xmm3, xmm1
+        psrldq xmm3, 4
+        pslldq xmm1, 12
+        pxor xmm0, xmm1
+        movdqa xmm1, xmm0
+        movdqa xmm2, xmm0
+        psrld xmm1, 1
+        psrld xmm2, 2
+        pxor xmm1, xmm2
+        pxor xmm1, xmm0
+        psrld xmm0, 7
+        pxor xmm1, xmm3
+        pxor xmm1, xmm0
+        pxor xmm6, xmm1
+        pshufb xmm6, OWORD PTR L_aes_gcm_bswap_mask
+        ; xmm0 = block from [esp+60] XOR byte-swapped hash result = expected tag
+        movdqu xmm0, xmm7
+        pxor xmm0, xmm6
+        mov esi, DWORD PTR [esp+40]
+        mov edi, DWORD PTR [esp+64]
+        cmp DWORD PTR [esp+44], 16
+        je L_AES_GCM_decrypt_final_aesni_cmp_tag_16
+        ; Tag length other than 16: spill the expected tag and compare byte by
+        ; byte, OR-ing every difference into bl so there is no early exit.
+        ; This path lowers esp by another 16 bytes, so every stack argument
+        ; is 16 bytes further away until the matching add esp, 16 below.
+        sub esp, 16
+        xor ecx, ecx
+        xor ebx, ebx
+        movdqu OWORD PTR [esp], xmm0
+L_AES_GCM_decrypt_final_aesni_cmp_tag_loop:
+        movzx eax, BYTE PTR [esp+ecx]
+        xor al, BYTE PTR [esi+ecx]
+        or bl, al
+        inc ecx
+        ; Tag length is at [esp+60] here (it was [esp+44] before the extra
+        ; sub esp, 16). [esp+44] in this frame is the saved ebx, which made
+        ; the loop bound depend on the caller's register contents.
+        cmp ecx, DWORD PTR [esp+60]
+        jne L_AES_GCM_decrypt_final_aesni_cmp_tag_loop
+        ; ebx = 1 when no byte differed, else 0 (upper bits were cleared above)
+        cmp bl, 0
+        sete bl
+        add esp, 16
+        xor ecx, ecx
+        jmp L_AES_GCM_decrypt_final_aesni_cmp_tag_done
+L_AES_GCM_decrypt_final_aesni_cmp_tag_16:
+        movdqu xmm1, OWORD PTR [esi]
+        pcmpeqb xmm0, xmm1
+        pmovmskb edx, xmm0
+        ; edx == 0xFFFF (all 16 bytes equal) => result 1, otherwise result 0
+        xor ebx, ebx
+        cmp edx, 65535
+        sete bl
+L_AES_GCM_decrypt_final_aesni_cmp_tag_done:
+        mov DWORD PTR [edi], ebx
+        add esp, 16
+        pop ebp
+        pop edi
+        pop esi
+        pop ebx
+        ret
+AES_GCM_decrypt_final_aesni ENDP
+_TEXT ENDS
+ENDIF
+IFDEF HAVE_INTEL_AVX1
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_avx1 PROC
+        push ebx
+        push esi
+        push edi
+        push ebp
+        sub esp, 112
+        mov esi, DWORD PTR [esp+144]
+        mov ebp, DWORD PTR [esp+168]
+        mov edx, DWORD PTR [esp+160]
+        vpxor xmm0, xmm0, xmm0
+        vpxor xmm2, xmm2, xmm2
+        cmp edx, 12
+        jne L_AES_GCM_encrypt_avx1_iv_not_12
+        ; # Calculate values when IV is 12 bytes
+        ; Set counter based on IV
+        mov ecx, 16777216
+        vpinsrd xmm0, xmm0, DWORD PTR [esi], 0
+        vpinsrd xmm0, xmm0, DWORD PTR [esi+4], 1
+        vpinsrd xmm0, xmm0, DWORD PTR [esi+8], 2
+        vpinsrd xmm0, xmm0, ecx, 3
+        ; H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqa xmm1, OWORD PTR [ebp]
+        vpxor xmm5, xmm0, xmm1
+        vmovdqa xmm3, OWORD PTR [ebp+16]
+        vaesenc xmm1, xmm1, xmm3
+        vaesenc xmm5, xmm5, xmm3
+        vmovdqa xmm3, OWORD PTR [ebp+32]
+        vaesenc xmm1, xmm1, xmm3
+        vaesenc xmm5, xmm5, xmm3
+        vmovdqa xmm3, OWORD PTR [ebp+48]
+        vaesenc xmm1, xmm1, xmm3
+        vaesenc xmm5, xmm5, xmm3
+        vmovdqa xmm3, OWORD PTR [ebp+64]
+        vaesenc xmm1, xmm1, xmm3
+        vaesenc xmm5, xmm5, xmm3
+        vmovdqa xmm3, OWORD PTR [ebp+80]
+        vaesenc xmm1, xmm1, xmm3
+        vaesenc xmm5, xmm5, xmm3
+        vmovdqa xmm3, OWORD PTR [ebp+96]
+        vaesenc xmm1, xmm1, xmm3
+        vaesenc xmm5, xmm5, xmm3
+        vmovdqa xmm3, OWORD PTR [ebp+112]
+        vaesenc xmm1, xmm1, xmm3
+        vaesenc xmm5,
xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+128] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+144] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + cmp DWORD PTR [esp+172], 11 + vmovdqa xmm3, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx1_calc_iv_12_last + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+176] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + cmp DWORD PTR [esp+172], 13 + vmovdqa xmm3, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx1_calc_iv_12_last + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+208] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx1_calc_iv_12_last: + vaesenclast xmm1, xmm1, xmm3 + vaesenclast xmm5, xmm5, xmm3 + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_mask + vmovdqu OWORD PTR [esp+80], xmm5 + jmp L_AES_GCM_encrypt_avx1_iv_done +L_AES_GCM_encrypt_avx1_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm1, OWORD PTR [ebp] + vaesenc xmm1, xmm1, [ebp+16] + vaesenc xmm1, xmm1, [ebp+32] + vaesenc xmm1, xmm1, [ebp+48] + vaesenc xmm1, xmm1, [ebp+64] + vaesenc xmm1, xmm1, [ebp+80] + vaesenc xmm1, xmm1, [ebp+96] + vaesenc xmm1, xmm1, [ebp+112] + vaesenc xmm1, xmm1, [ebp+128] + vaesenc xmm1, xmm1, [ebp+144] + cmp DWORD PTR [esp+172], 11 + vmovdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last + vaesenc xmm1, xmm1, xmm5 + vaesenc xmm1, xmm1, [ebp+176] + cmp DWORD PTR [esp+172], 13 + vmovdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last + vaesenc xmm1, xmm1, xmm5 + vaesenc xmm1, xmm1, [ebp+208] + vmovdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last: + vaesenclast xmm1, xmm1, xmm5 + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov ecx, 0 + je 
L_AES_GCM_encrypt_avx1_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_encrypt_avx1_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_avx1_calc_iv_16_loop: + vmovdqu xmm4, OWORD PTR [esi+ecx] + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm4 + ; ghash_gfmul_avx + vpshufd xmm5, xmm0, 78 + vpshufd xmm6, xmm1, 78 + vpclmulqdq xmm7, xmm1, xmm0, 17 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpxor xmm5, xmm5, xmm0 + vpxor xmm6, xmm6, xmm1 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vmovdqa xmm3, xmm4 + vmovdqa xmm0, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm3, xmm3, xmm6 + vpxor xmm0, xmm0, xmm5 + vpsrld xmm4, xmm3, 31 + vpsrld xmm5, xmm0, 31 + vpslld xmm3, xmm3, 1 + vpslld xmm0, xmm0, 1 + vpsrldq xmm6, xmm4, 12 + vpslldq xmm4, xmm4, 4 + vpslldq xmm5, xmm5, 4 + vpor xmm0, xmm0, xmm6 + vpor xmm3, xmm3, xmm4 + vpor xmm0, xmm0, xmm5 + vpslld xmm4, xmm3, 31 + vpslld xmm5, xmm3, 30 + vpslld xmm6, xmm3, 25 + vpxor xmm4, xmm4, xmm5 + vpxor xmm4, xmm4, xmm6 + vmovdqa xmm5, xmm4 + vpsrldq xmm5, xmm5, 4 + vpslldq xmm4, xmm4, 12 + vpxor xmm3, xmm3, xmm4 + vpsrld xmm6, xmm3, 1 + vpsrld xmm7, xmm3, 2 + vpsrld xmm4, xmm3, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm4 + vpxor xmm6, xmm6, xmm5 + vpxor xmm6, xmm6, xmm3 + vpxor xmm0, xmm0, xmm6 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_avx1_calc_iv_16_loop + mov edx, DWORD PTR [esp+160] + cmp ecx, edx + je L_AES_GCM_encrypt_avx1_calc_iv_done +L_AES_GCM_encrypt_avx1_calc_iv_lt16: + sub esp, 16 + vpxor xmm4, xmm4, xmm4 + xor ebx, ebx + vmovdqu OWORD PTR [esp], xmm4 +L_AES_GCM_encrypt_avx1_calc_iv_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_avx1_calc_iv_loop + vmovdqu xmm4, OWORD PTR [esp] + add esp, 16 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm4 + ; ghash_gfmul_avx + vpshufd xmm5, xmm0, 78 + vpshufd xmm6, xmm1, 78 + 
vpclmulqdq xmm7, xmm1, xmm0, 17 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpxor xmm5, xmm5, xmm0 + vpxor xmm6, xmm6, xmm1 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vmovdqa xmm3, xmm4 + vmovdqa xmm0, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm3, xmm3, xmm6 + vpxor xmm0, xmm0, xmm5 + vpsrld xmm4, xmm3, 31 + vpsrld xmm5, xmm0, 31 + vpslld xmm3, xmm3, 1 + vpslld xmm0, xmm0, 1 + vpsrldq xmm6, xmm4, 12 + vpslldq xmm4, xmm4, 4 + vpslldq xmm5, xmm5, 4 + vpor xmm0, xmm0, xmm6 + vpor xmm3, xmm3, xmm4 + vpor xmm0, xmm0, xmm5 + vpslld xmm4, xmm3, 31 + vpslld xmm5, xmm3, 30 + vpslld xmm6, xmm3, 25 + vpxor xmm4, xmm4, xmm5 + vpxor xmm4, xmm4, xmm6 + vmovdqa xmm5, xmm4 + vpsrldq xmm5, xmm5, 4 + vpslldq xmm4, xmm4, 12 + vpxor xmm3, xmm3, xmm4 + vpsrld xmm6, xmm3, 1 + vpsrld xmm7, xmm3, 2 + vpsrld xmm4, xmm3, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm4 + vpxor xmm6, xmm6, xmm5 + vpxor xmm6, xmm6, xmm3 + vpxor xmm0, xmm0, xmm6 +L_AES_GCM_encrypt_avx1_calc_iv_done: + ; T = Encrypt counter + vpxor xmm4, xmm4, xmm4 + shl edx, 3 + vpinsrd xmm4, xmm4, edx, 0 + vpxor xmm0, xmm0, xmm4 + ; ghash_gfmul_avx + vpshufd xmm5, xmm0, 78 + vpshufd xmm6, xmm1, 78 + vpclmulqdq xmm7, xmm1, xmm0, 17 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpxor xmm5, xmm5, xmm0 + vpxor xmm6, xmm6, xmm1 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vmovdqa xmm3, xmm4 + vmovdqa xmm0, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm3, xmm3, xmm6 + vpxor xmm0, xmm0, xmm5 + vpsrld xmm4, xmm3, 31 + vpsrld xmm5, xmm0, 31 + vpslld xmm3, xmm3, 1 + vpslld xmm0, xmm0, 1 + vpsrldq xmm6, xmm4, 12 + vpslldq xmm4, xmm4, 4 + vpslldq xmm5, xmm5, 4 + vpor xmm0, xmm0, xmm6 + vpor xmm3, xmm3, xmm4 + vpor xmm0, xmm0, xmm5 + vpslld xmm4, xmm3, 31 + vpslld xmm5, xmm3, 30 + vpslld xmm6, xmm3, 25 + vpxor xmm4, xmm4, xmm5 + vpxor xmm4, xmm4, xmm6 + vmovdqa xmm5, xmm4 + vpsrldq xmm5, xmm5, 4 + vpslldq xmm4, xmm4, 12 + vpxor xmm3, 
xmm3, xmm4 + vpsrld xmm6, xmm3, 1 + vpsrld xmm7, xmm3, 2 + vpsrld xmm4, xmm3, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm4 + vpxor xmm6, xmm6, xmm5 + vpxor xmm6, xmm6, xmm3 + vpxor xmm0, xmm0, xmm6 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + ; Encrypt counter + vmovdqa xmm4, OWORD PTR [ebp] + vpxor xmm4, xmm4, xmm0 + vaesenc xmm4, xmm4, [ebp+16] + vaesenc xmm4, xmm4, [ebp+32] + vaesenc xmm4, xmm4, [ebp+48] + vaesenc xmm4, xmm4, [ebp+64] + vaesenc xmm4, xmm4, [ebp+80] + vaesenc xmm4, xmm4, [ebp+96] + vaesenc xmm4, xmm4, [ebp+112] + vaesenc xmm4, xmm4, [ebp+128] + vaesenc xmm4, xmm4, [ebp+144] + cmp DWORD PTR [esp+172], 11 + vmovdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last + vaesenc xmm4, xmm4, xmm5 + vaesenc xmm4, xmm4, [ebp+176] + cmp DWORD PTR [esp+172], 13 + vmovdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last + vaesenc xmm4, xmm4, xmm5 + vaesenc xmm4, xmm4, [ebp+208] + vmovdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last: + vaesenclast xmm4, xmm4, xmm5 + vmovdqu OWORD PTR [esp+80], xmm4 +L_AES_GCM_encrypt_avx1_iv_done: + mov esi, DWORD PTR [esp+140] + ; Additional authentication data + mov edx, DWORD PTR [esp+156] + cmp edx, 0 + je L_AES_GCM_encrypt_avx1_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_encrypt_avx1_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_avx1_calc_aad_16_loop: + vmovdqu xmm4, OWORD PTR [esi+ecx] + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm2, xmm2, xmm4 + ; ghash_gfmul_avx + vpshufd xmm5, xmm2, 78 + vpshufd xmm6, xmm1, 78 + vpclmulqdq xmm7, xmm1, xmm2, 17 + vpclmulqdq xmm4, xmm1, xmm2, 0 + vpxor xmm5, xmm5, xmm2 + vpxor xmm6, xmm6, xmm1 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vmovdqa xmm3, xmm4 + vmovdqa xmm2, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm3, xmm3, xmm6 + vpxor xmm2, xmm2, xmm5 + vpsrld xmm4, 
xmm3, 31 + vpsrld xmm5, xmm2, 31 + vpslld xmm3, xmm3, 1 + vpslld xmm2, xmm2, 1 + vpsrldq xmm6, xmm4, 12 + vpslldq xmm4, xmm4, 4 + vpslldq xmm5, xmm5, 4 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm4 + vpor xmm2, xmm2, xmm5 + vpslld xmm4, xmm3, 31 + vpslld xmm5, xmm3, 30 + vpslld xmm6, xmm3, 25 + vpxor xmm4, xmm4, xmm5 + vpxor xmm4, xmm4, xmm6 + vmovdqa xmm5, xmm4 + vpsrldq xmm5, xmm5, 4 + vpslldq xmm4, xmm4, 12 + vpxor xmm3, xmm3, xmm4 + vpsrld xmm6, xmm3, 1 + vpsrld xmm7, xmm3, 2 + vpsrld xmm4, xmm3, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm4 + vpxor xmm6, xmm6, xmm5 + vpxor xmm6, xmm6, xmm3 + vpxor xmm2, xmm2, xmm6 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_avx1_calc_aad_16_loop + mov edx, DWORD PTR [esp+156] + cmp ecx, edx + je L_AES_GCM_encrypt_avx1_calc_aad_done +L_AES_GCM_encrypt_avx1_calc_aad_lt16: + sub esp, 16 + vpxor xmm4, xmm4, xmm4 + xor ebx, ebx + vmovdqu OWORD PTR [esp], xmm4 +L_AES_GCM_encrypt_avx1_calc_aad_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_avx1_calc_aad_loop + vmovdqu xmm4, OWORD PTR [esp] + add esp, 16 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm2, xmm2, xmm4 + ; ghash_gfmul_avx + vpshufd xmm5, xmm2, 78 + vpshufd xmm6, xmm1, 78 + vpclmulqdq xmm7, xmm1, xmm2, 17 + vpclmulqdq xmm4, xmm1, xmm2, 0 + vpxor xmm5, xmm5, xmm2 + vpxor xmm6, xmm6, xmm1 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vmovdqa xmm3, xmm4 + vmovdqa xmm2, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm3, xmm3, xmm6 + vpxor xmm2, xmm2, xmm5 + vpsrld xmm4, xmm3, 31 + vpsrld xmm5, xmm2, 31 + vpslld xmm3, xmm3, 1 + vpslld xmm2, xmm2, 1 + vpsrldq xmm6, xmm4, 12 + vpslldq xmm4, xmm4, 4 + vpslldq xmm5, xmm5, 4 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm4 + vpor xmm2, xmm2, xmm5 + vpslld xmm4, xmm3, 31 + vpslld xmm5, xmm3, 30 + vpslld xmm6, xmm3, 25 + vpxor xmm4, xmm4, xmm5 + vpxor xmm4, xmm4, xmm6 
+ vmovdqa xmm5, xmm4 + vpsrldq xmm5, xmm5, 4 + vpslldq xmm4, xmm4, 12 + vpxor xmm3, xmm3, xmm4 + vpsrld xmm6, xmm3, 1 + vpsrld xmm7, xmm3, 2 + vpsrld xmm4, xmm3, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm4 + vpxor xmm6, xmm6, xmm5 + vpxor xmm6, xmm6, xmm3 + vpxor xmm2, xmm2, xmm6 +L_AES_GCM_encrypt_avx1_calc_aad_done: + vmovdqu OWORD PTR [esp+96], xmm2 + mov esi, DWORD PTR [esp+132] + mov edi, DWORD PTR [esp+136] + ; Calculate counter and H + vpsrlq xmm5, xmm1, 63 + vpsllq xmm4, xmm1, 1 + vpslldq xmm5, xmm5, 8 + vpor xmm4, xmm4, xmm5 + vpshufd xmm1, xmm1, 255 + vpsrad xmm1, xmm1, 31 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_epi64 + vpand xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_mod2_128 + vpaddd xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_one + vpxor xmm1, xmm1, xmm4 + vmovdqu OWORD PTR [esp+64], xmm0 + xor ebx, ebx + cmp DWORD PTR [esp+152], 64 + mov eax, DWORD PTR [esp+152] + jl L_AES_GCM_encrypt_avx1_done_64 + and eax, 4294967232 + vmovdqa xmm6, xmm2 + ; H ^ 1 + vmovdqu OWORD PTR [esp], xmm1 + ; H ^ 2 + vpclmulqdq xmm4, xmm1, xmm1, 0 + vpclmulqdq xmm0, xmm1, xmm1, 17 + vpslld xmm5, xmm4, 31 + vpslld xmm6, xmm4, 30 + vpslld xmm7, xmm4, 25 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm7 + vpsrldq xmm7, xmm5, 4 + vpslldq xmm5, xmm5, 12 + vpxor xmm4, xmm4, xmm5 + vpsrld xmm5, xmm4, 1 + vpsrld xmm6, xmm4, 2 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm4 + vpsrld xmm4, xmm4, 7 + vpxor xmm5, xmm5, xmm7 + vpxor xmm5, xmm5, xmm4 + vpxor xmm0, xmm0, xmm5 + vmovdqu OWORD PTR [esp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm5, xmm1, 78 + vpshufd xmm6, xmm0, 78 + vpclmulqdq xmm7, xmm0, xmm1, 17 + vpclmulqdq xmm4, xmm0, xmm1, 0 + vpxor xmm5, xmm5, xmm1 + vpxor xmm6, xmm6, xmm0 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm4, xmm4, xmm6 + vpxor xmm3, xmm7, xmm5 + vpslld xmm5, xmm4, 31 + vpslld xmm6, xmm4, 30 + vpslld xmm7, xmm4, 25 + vpxor xmm5, xmm5, xmm6 
+ vpxor xmm5, xmm5, xmm7 + vpsrldq xmm7, xmm5, 4 + vpslldq xmm5, xmm5, 12 + vpxor xmm4, xmm4, xmm5 + vpsrld xmm5, xmm4, 1 + vpsrld xmm6, xmm4, 2 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm4 + vpsrld xmm4, xmm4, 7 + vpxor xmm5, xmm5, xmm7 + vpxor xmm5, xmm5, xmm4 + vpxor xmm3, xmm3, xmm5 + vmovdqu OWORD PTR [esp+32], xmm3 + ; H ^ 4 + vpclmulqdq xmm4, xmm0, xmm0, 0 + vpclmulqdq xmm3, xmm0, xmm0, 17 + vpslld xmm5, xmm4, 31 + vpslld xmm6, xmm4, 30 + vpslld xmm7, xmm4, 25 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm7 + vpsrldq xmm7, xmm5, 4 + vpslldq xmm5, xmm5, 12 + vpxor xmm4, xmm4, xmm5 + vpsrld xmm5, xmm4, 1 + vpsrld xmm6, xmm4, 2 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm4 + vpsrld xmm4, xmm4, 7 + vpxor xmm5, xmm5, xmm7 + vpxor xmm5, xmm5, xmm4 + vpxor xmm3, xmm3, xmm5 + vmovdqu OWORD PTR [esp+48], xmm3 + ; First 64 bytes of input + vmovdqu xmm4, OWORD PTR [esp+64] + vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx1_four + vmovdqu OWORD PTR [esp+64], xmm3 + vmovdqa xmm3, OWORD PTR L_aes_gcm_avx1_bswap_epi64 + vpaddd xmm5, xmm4, OWORD PTR L_aes_gcm_avx1_one + vpshufb xmm5, xmm5, xmm3 + vpaddd xmm6, xmm4, OWORD PTR L_aes_gcm_avx1_two + vpshufb xmm6, xmm6, xmm3 + vpaddd xmm7, xmm4, OWORD PTR L_aes_gcm_avx1_three + vpshufb xmm7, xmm7, xmm3 + vpshufb xmm4, xmm4, xmm3 + vmovdqa xmm3, OWORD PTR [ebp] + vpxor xmm4, xmm4, xmm3 + vpxor xmm5, xmm5, xmm3 + vpxor xmm6, xmm6, xmm3 + vpxor xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+16] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+32] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+48] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+64] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + 
vmovdqa xmm3, OWORD PTR [ebp+80] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+96] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+112] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+128] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+144] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + cmp DWORD PTR [esp+172], 11 + vmovdqa xmm3, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx1_aesenc_64_enc_done + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+176] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + cmp DWORD PTR [esp+172], 13 + vmovdqa xmm3, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx1_aesenc_64_enc_done + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+208] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx1_aesenc_64_enc_done: + vaesenclast xmm4, xmm4, xmm3 + vaesenclast xmm5, xmm5, xmm3 + vmovdqu xmm0, OWORD PTR [esi] + vmovdqu xmm1, OWORD PTR [esi+16] + vpxor xmm4, xmm4, xmm0 + vpxor xmm5, xmm5, xmm1 + vmovdqu OWORD PTR [edi], xmm4 + vmovdqu OWORD PTR [edi+16], xmm5 + vaesenclast xmm6, xmm6, xmm3 + vaesenclast xmm7, xmm7, xmm3 + vmovdqu xmm0, OWORD PTR [esi+32] + vmovdqu xmm1, OWORD PTR [esi+48] + vpxor xmm6, xmm6, xmm0 + vpxor xmm7, xmm7, xmm1 + vmovdqu OWORD PTR [edi+32], xmm6 + vmovdqu 
OWORD PTR [edi+48], xmm7 + cmp eax, 64 + mov ebx, 64 + mov ecx, esi + mov edx, edi + jle L_AES_GCM_encrypt_avx1_end_64 + ; More 64 bytes of input +L_AES_GCM_encrypt_avx1_ghash_64: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + vmovdqu xmm4, OWORD PTR [esp+64] + vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx1_four + vmovdqu OWORD PTR [esp+64], xmm3 + vmovdqa xmm3, OWORD PTR L_aes_gcm_avx1_bswap_epi64 + vpaddd xmm5, xmm4, OWORD PTR L_aes_gcm_avx1_one + vpshufb xmm5, xmm5, xmm3 + vpaddd xmm6, xmm4, OWORD PTR L_aes_gcm_avx1_two + vpshufb xmm6, xmm6, xmm3 + vpaddd xmm7, xmm4, OWORD PTR L_aes_gcm_avx1_three + vpshufb xmm7, xmm7, xmm3 + vpshufb xmm4, xmm4, xmm3 + vmovdqa xmm3, OWORD PTR [ebp] + vpxor xmm4, xmm4, xmm3 + vpxor xmm5, xmm5, xmm3 + vpxor xmm6, xmm6, xmm3 + vpxor xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+16] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+32] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+48] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+64] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+80] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+96] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+112] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+128] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+144] + vaesenc 
xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + cmp DWORD PTR [esp+172], 11 + vmovdqa xmm3, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+176] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + cmp DWORD PTR [esp+172], 13 + vmovdqa xmm3, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+208] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done: + vaesenclast xmm4, xmm4, xmm3 + vaesenclast xmm5, xmm5, xmm3 + vmovdqu xmm0, OWORD PTR [ecx] + vmovdqu xmm1, OWORD PTR [ecx+16] + vpxor xmm4, xmm4, xmm0 + vpxor xmm5, xmm5, xmm1 + vmovdqu OWORD PTR [edx], xmm4 + vmovdqu OWORD PTR [edx+16], xmm5 + vaesenclast xmm6, xmm6, xmm3 + vaesenclast xmm7, xmm7, xmm3 + vmovdqu xmm0, OWORD PTR [ecx+32] + vmovdqu xmm1, OWORD PTR [ecx+48] + vpxor xmm6, xmm6, xmm0 + vpxor xmm7, xmm7, xmm1 + vmovdqu OWORD PTR [edx+32], xmm6 + vmovdqu OWORD PTR [edx+48], xmm7 + ; ghash encrypted counter + vmovdqu xmm6, OWORD PTR [esp+96] + vmovdqu xmm3, OWORD PTR [esp+48] + vmovdqu xmm4, OWORD PTR [edx+-64] + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm4, xmm4, xmm6 + vpshufd xmm5, xmm3, 78 + vpshufd xmm1, xmm4, 78 + vpxor xmm5, xmm5, xmm3 + vpxor xmm1, xmm1, xmm4 + vpclmulqdq xmm7, xmm4, xmm3, 17 + vpclmulqdq xmm6, xmm4, xmm3, 0 + vpclmulqdq xmm5, xmm5, xmm1, 0 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm7 + vmovdqu xmm3, OWORD PTR [esp+32] + vmovdqu xmm4, OWORD PTR 
[edx+-48] + vpshufd xmm0, xmm3, 78 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm3 + vpshufd xmm1, xmm4, 78 + vpxor xmm1, xmm1, xmm4 + vpclmulqdq xmm2, xmm4, xmm3, 17 + vpclmulqdq xmm3, xmm4, xmm3, 0 + vpclmulqdq xmm0, xmm0, xmm1, 0 + vpxor xmm5, xmm5, xmm3 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm7, xmm7, xmm2 + vpxor xmm5, xmm5, xmm0 + vmovdqu xmm3, OWORD PTR [esp+16] + vmovdqu xmm4, OWORD PTR [edx+-32] + vpshufd xmm0, xmm3, 78 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm3 + vpshufd xmm1, xmm4, 78 + vpxor xmm1, xmm1, xmm4 + vpclmulqdq xmm2, xmm4, xmm3, 17 + vpclmulqdq xmm3, xmm4, xmm3, 0 + vpclmulqdq xmm0, xmm0, xmm1, 0 + vpxor xmm5, xmm5, xmm3 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm7, xmm7, xmm2 + vpxor xmm5, xmm5, xmm0 + vmovdqu xmm3, OWORD PTR [esp] + vmovdqu xmm4, OWORD PTR [edx+-16] + vpshufd xmm0, xmm3, 78 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm3 + vpshufd xmm1, xmm4, 78 + vpxor xmm1, xmm1, xmm4 + vpclmulqdq xmm2, xmm4, xmm3, 17 + vpclmulqdq xmm3, xmm4, xmm3, 0 + vpclmulqdq xmm0, xmm0, xmm1, 0 + vpxor xmm5, xmm5, xmm3 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm7, xmm7, xmm2 + vpxor xmm5, xmm5, xmm0 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm6, xmm6, xmm1 + vpxor xmm7, xmm7, xmm5 + vpslld xmm3, xmm6, 31 + vpslld xmm0, xmm6, 30 + vpslld xmm1, xmm6, 25 + vpxor xmm3, xmm3, xmm0 + vpxor xmm3, xmm3, xmm1 + vpsrldq xmm0, xmm3, 4 + vpslldq xmm3, xmm3, 12 + vpxor xmm6, xmm6, xmm3 + vpsrld xmm1, xmm6, 1 + vpsrld xmm5, xmm6, 2 + vpsrld xmm4, xmm6, 7 + vpxor xmm1, xmm1, xmm5 + vpxor xmm1, xmm1, xmm4 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 + vpxor xmm6, xmm6, xmm7 + vmovdqu OWORD PTR [esp+96], xmm6 + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_encrypt_avx1_ghash_64 +L_AES_GCM_encrypt_avx1_end_64: + vmovdqu xmm2, OWORD PTR [esp+96] + ; Block 1 + vmovdqa xmm4, OWORD PTR 
L_aes_gcm_avx1_bswap_mask + vmovdqu xmm1, OWORD PTR [edx] + vpshufb xmm1, xmm1, xmm4 + vmovdqu xmm3, OWORD PTR [esp+48] + vpxor xmm1, xmm1, xmm2 + ; ghash_gfmul_avx + vpshufd xmm5, xmm1, 78 + vpshufd xmm6, xmm3, 78 + vpclmulqdq xmm7, xmm3, xmm1, 17 + vpclmulqdq xmm4, xmm3, xmm1, 0 + vpxor xmm5, xmm5, xmm1 + vpxor xmm6, xmm6, xmm3 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vmovdqa xmm0, xmm4 + vmovdqa xmm2, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm0, xmm0, xmm6 + vpxor xmm2, xmm2, xmm5 + ; Block 2 + vmovdqa xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vmovdqu xmm1, OWORD PTR [edx+16] + vpshufb xmm1, xmm1, xmm4 + vmovdqu xmm3, OWORD PTR [esp+32] + ; ghash_gfmul_xor_avx + vpshufd xmm5, xmm1, 78 + vpshufd xmm6, xmm3, 78 + vpclmulqdq xmm7, xmm3, xmm1, 17 + vpclmulqdq xmm4, xmm3, xmm1, 0 + vpxor xmm5, xmm5, xmm1 + vpxor xmm6, xmm6, xmm3 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vpxor xmm0, xmm0, xmm4 + vpxor xmm2, xmm2, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm0, xmm0, xmm6 + vpxor xmm2, xmm2, xmm5 + ; Block 3 + vmovdqa xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vmovdqu xmm1, OWORD PTR [edx+32] + vpshufb xmm1, xmm1, xmm4 + vmovdqu xmm3, OWORD PTR [esp+16] + ; ghash_gfmul_xor_avx + vpshufd xmm5, xmm1, 78 + vpshufd xmm6, xmm3, 78 + vpclmulqdq xmm7, xmm3, xmm1, 17 + vpclmulqdq xmm4, xmm3, xmm1, 0 + vpxor xmm5, xmm5, xmm1 + vpxor xmm6, xmm6, xmm3 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vpxor xmm0, xmm0, xmm4 + vpxor xmm2, xmm2, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm0, xmm0, xmm6 + vpxor xmm2, xmm2, xmm5 + ; Block 4 + vmovdqa xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vmovdqu xmm1, OWORD PTR [edx+48] + vpshufb xmm1, xmm1, xmm4 + vmovdqu xmm3, OWORD PTR [esp] + ; ghash_gfmul_xor_avx + vpshufd xmm5, xmm1, 78 + vpshufd xmm6, xmm3, 78 + vpclmulqdq xmm7, xmm3, xmm1, 17 + vpclmulqdq 
xmm4, xmm3, xmm1, 0 + vpxor xmm5, xmm5, xmm1 + vpxor xmm6, xmm6, xmm3 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vpxor xmm0, xmm0, xmm4 + vpxor xmm2, xmm2, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm0, xmm0, xmm6 + vpxor xmm2, xmm2, xmm5 + vpslld xmm4, xmm0, 31 + vpslld xmm5, xmm0, 30 + vpslld xmm6, xmm0, 25 + vpxor xmm4, xmm4, xmm5 + vpxor xmm4, xmm4, xmm6 + vmovdqa xmm5, xmm4 + vpsrldq xmm5, xmm5, 4 + vpslldq xmm4, xmm4, 12 + vpxor xmm0, xmm0, xmm4 + vpsrld xmm6, xmm0, 1 + vpsrld xmm7, xmm0, 2 + vpsrld xmm4, xmm0, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm4 + vpxor xmm6, xmm6, xmm5 + vpxor xmm6, xmm6, xmm0 + vpxor xmm2, xmm2, xmm6 + vmovdqu xmm1, OWORD PTR [esp] +L_AES_GCM_encrypt_avx1_done_64: + mov edx, DWORD PTR [esp+152] + cmp ebx, edx + jge L_AES_GCM_encrypt_avx1_done_enc + mov eax, DWORD PTR [esp+152] + and eax, 4294967280 + cmp ebx, eax + jge L_AES_GCM_encrypt_avx1_last_block_done + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + vmovdqu xmm5, OWORD PTR [esp+64] + vpshufb xmm4, xmm5, OWORD PTR L_aes_gcm_avx1_bswap_epi64 + vpaddd xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_one + vmovdqu OWORD PTR [esp+64], xmm5 + vpxor xmm4, xmm4, [ebp] + vaesenc xmm4, xmm4, [ebp+16] + vaesenc xmm4, xmm4, [ebp+32] + vaesenc xmm4, xmm4, [ebp+48] + vaesenc xmm4, xmm4, [ebp+64] + vaesenc xmm4, xmm4, [ebp+80] + vaesenc xmm4, xmm4, [ebp+96] + vaesenc xmm4, xmm4, [ebp+112] + vaesenc xmm4, xmm4, [ebp+128] + vaesenc xmm4, xmm4, [ebp+144] + cmp DWORD PTR [esp+172], 11 + vmovdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last + vaesenc xmm4, xmm4, xmm5 + vaesenc xmm4, xmm4, [ebp+176] + cmp DWORD PTR [esp+172], 13 + vmovdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last + vaesenc xmm4, xmm4, xmm5 + vaesenc xmm4, xmm4, [ebp+208] + vmovdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last: + vaesenclast xmm4, 
xmm4, xmm5 + vmovdqu xmm5, OWORD PTR [ecx] + vpxor xmm4, xmm4, xmm5 + vmovdqu OWORD PTR [edx], xmm4 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm2, xmm2, xmm4 + add ebx, 16 + cmp ebx, eax + jge L_AES_GCM_encrypt_avx1_last_block_ghash +L_AES_GCM_encrypt_avx1_last_block_start: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + vmovdqu xmm5, OWORD PTR [esp+64] + vmovdqu xmm7, xmm2 + vpshufb xmm4, xmm5, OWORD PTR L_aes_gcm_avx1_bswap_epi64 + vpaddd xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_one + vmovdqu OWORD PTR [esp+64], xmm5 + vpxor xmm4, xmm4, [ebp] + vpclmulqdq xmm0, xmm7, xmm1, 16 + vaesenc xmm4, xmm4, [ebp+16] + vaesenc xmm4, xmm4, [ebp+32] + vpclmulqdq xmm3, xmm7, xmm1, 1 + vaesenc xmm4, xmm4, [ebp+48] + vaesenc xmm4, xmm4, [ebp+64] + vaesenc xmm4, xmm4, [ebp+80] + vpclmulqdq xmm5, xmm7, xmm1, 17 + vaesenc xmm4, xmm4, [ebp+96] + vpxor xmm0, xmm0, xmm3 + vpslldq xmm6, xmm0, 8 + vpsrldq xmm0, xmm0, 8 + vaesenc xmm4, xmm4, [ebp+112] + vpclmulqdq xmm3, xmm7, xmm1, 0 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm0 + vmovdqa xmm7, OWORD PTR L_aes_gcm_avx1_mod2_128 + vpclmulqdq xmm3, xmm6, xmm7, 16 + vaesenc xmm4, xmm4, [ebp+128] + vpshufd xmm0, xmm6, 78 + vpxor xmm0, xmm0, xmm3 + vpclmulqdq xmm3, xmm0, xmm7, 16 + vaesenc xmm4, xmm4, [ebp+144] + vpshufd xmm2, xmm0, 78 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm5 + cmp DWORD PTR [esp+172], 11 + vmovdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx1_aesenc_gfmul_last + vaesenc xmm4, xmm4, xmm5 + vaesenc xmm4, xmm4, [ebp+176] + cmp DWORD PTR [esp+172], 13 + vmovdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx1_aesenc_gfmul_last + vaesenc xmm4, xmm4, xmm5 + vaesenc xmm4, xmm4, [ebp+208] + vmovdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx1_aesenc_gfmul_last: + vaesenclast xmm4, xmm4, xmm5 + vmovdqu xmm5, OWORD PTR [ecx] + vpxor xmm4, xmm4, xmm5 + vmovdqu OWORD PTR [edx], xmm4 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + add ebx, 16 + vpxor xmm2, 
xmm2, xmm4 + cmp ebx, eax + jl L_AES_GCM_encrypt_avx1_last_block_start +L_AES_GCM_encrypt_avx1_last_block_ghash: + ; ghash_gfmul_red_avx + vpshufd xmm5, xmm1, 78 + vpshufd xmm6, xmm2, 78 + vpclmulqdq xmm7, xmm2, xmm1, 17 + vpclmulqdq xmm4, xmm2, xmm1, 0 + vpxor xmm5, xmm5, xmm1 + vpxor xmm6, xmm6, xmm2 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm4, xmm4, xmm6 + vpxor xmm2, xmm7, xmm5 + vpslld xmm5, xmm4, 31 + vpslld xmm6, xmm4, 30 + vpslld xmm7, xmm4, 25 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm7 + vpsrldq xmm7, xmm5, 4 + vpslldq xmm5, xmm5, 12 + vpxor xmm4, xmm4, xmm5 + vpsrld xmm5, xmm4, 1 + vpsrld xmm6, xmm4, 2 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm4 + vpsrld xmm4, xmm4, 7 + vpxor xmm5, xmm5, xmm7 + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm2, xmm5 +L_AES_GCM_encrypt_avx1_last_block_done: + mov ecx, DWORD PTR [esp+152] + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done + vmovdqu xmm0, OWORD PTR [esp+64] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_epi64 + vpxor xmm0, xmm0, [ebp] + vaesenc xmm0, xmm0, [ebp+16] + vaesenc xmm0, xmm0, [ebp+32] + vaesenc xmm0, xmm0, [ebp+48] + vaesenc xmm0, xmm0, [ebp+64] + vaesenc xmm0, xmm0, [ebp+80] + vaesenc xmm0, xmm0, [ebp+96] + vaesenc xmm0, xmm0, [ebp+112] + vaesenc xmm0, xmm0, [ebp+128] + vaesenc xmm0, xmm0, [ebp+144] + cmp DWORD PTR [esp+172], 11 + vmovdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm0, xmm0, xmm5 + vaesenc xmm0, xmm0, [ebp+176] + cmp DWORD PTR [esp+172], 13 + vmovdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm0, xmm0, xmm5 + vaesenc xmm0, xmm0, [ebp+208] + vmovdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast xmm0, xmm0, xmm5 + sub esp, 16 + xor ecx, ecx + vmovdqu OWORD PTR 
[esp], xmm0 +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop: + movzx eax, BYTE PTR [esi+ebx] + xor al, BYTE PTR [esp+ecx] + mov BYTE PTR [edi+ebx], al + mov BYTE PTR [esp+ecx], al + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop + xor eax, eax + cmp ecx, 16 + je L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop: + mov BYTE PTR [esp+ecx], al + inc ecx + cmp ecx, 16 + jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc: + vmovdqu xmm0, OWORD PTR [esp] + add esp, 16 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm2, xmm2, xmm0 + ; ghash_gfmul_red_avx + vpshufd xmm5, xmm1, 78 + vpshufd xmm6, xmm2, 78 + vpclmulqdq xmm7, xmm2, xmm1, 17 + vpclmulqdq xmm4, xmm2, xmm1, 0 + vpxor xmm5, xmm5, xmm1 + vpxor xmm6, xmm6, xmm2 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm4, xmm4, xmm6 + vpxor xmm2, xmm7, xmm5 + vpslld xmm5, xmm4, 31 + vpslld xmm6, xmm4, 30 + vpslld xmm7, xmm4, 25 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm7 + vpsrldq xmm7, xmm5, 4 + vpslldq xmm5, xmm5, 12 + vpxor xmm4, xmm4, xmm5 + vpsrld xmm5, xmm4, 1 + vpsrld xmm6, xmm4, 2 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm4 + vpsrld xmm4, xmm4, 7 + vpxor xmm5, xmm5, xmm7 + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm2, xmm5 +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_avx1_done_enc: + mov edi, DWORD PTR [esp+148] + mov ebx, DWORD PTR [esp+164] + mov edx, DWORD PTR [esp+152] + mov ecx, DWORD PTR [esp+156] + shl edx, 3 + shl ecx, 3 + vpinsrd xmm4, xmm4, edx, 0 + vpinsrd xmm4, xmm4, ecx, 2 + mov edx, DWORD PTR [esp+152] + mov ecx, DWORD PTR [esp+156] + shr edx, 29 + shr ecx, 29 + vpinsrd xmm4, xmm4, edx, 1 + vpinsrd xmm4, xmm4, ecx, 3 + vpxor xmm2, xmm2, xmm4 + ; ghash_gfmul_red_avx + vpshufd xmm5, xmm1, 78 + 
vpshufd xmm6, xmm2, 78 + vpclmulqdq xmm7, xmm2, xmm1, 17 + vpclmulqdq xmm4, xmm2, xmm1, 0 + vpxor xmm5, xmm5, xmm1 + vpxor xmm6, xmm6, xmm2 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm4, xmm4, xmm6 + vpxor xmm2, xmm7, xmm5 + vpslld xmm5, xmm4, 31 + vpslld xmm6, xmm4, 30 + vpslld xmm7, xmm4, 25 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm7 + vpsrldq xmm7, xmm5, 4 + vpslldq xmm5, xmm5, 12 + vpxor xmm4, xmm4, xmm5 + vpsrld xmm5, xmm4, 1 + vpsrld xmm6, xmm4, 2 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm4 + vpsrld xmm4, xmm4, 7 + vpxor xmm5, xmm5, xmm7 + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm2, xmm5 + vpshufb xmm2, xmm2, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm4, xmm2, [esp+80] + cmp ebx, 16 + je L_AES_GCM_encrypt_avx1_store_tag_16 + xor ecx, ecx + vmovdqu OWORD PTR [esp], xmm4 +L_AES_GCM_encrypt_avx1_store_tag_loop: + movzx eax, BYTE PTR [esp+ecx] + mov BYTE PTR [edi+ecx], al + inc ecx + cmp ecx, ebx + jne L_AES_GCM_encrypt_avx1_store_tag_loop + jmp L_AES_GCM_encrypt_avx1_store_tag_done +L_AES_GCM_encrypt_avx1_store_tag_16: + vmovdqu OWORD PTR [edi], xmm4 +L_AES_GCM_encrypt_avx1_store_tag_done: + add esp, 112 + pop ebp + pop edi + pop esi + pop ebx + ret +AES_GCM_encrypt_avx1 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_decrypt_avx1 PROC + push ebx + push esi + push edi + push ebp + sub esp, 176 + mov esi, DWORD PTR [esp+208] + mov ebp, DWORD PTR [esp+232] + mov edx, DWORD PTR [esp+224] + vpxor xmm0, xmm0, xmm0 + vpxor xmm2, xmm2, xmm2 + cmp edx, 12 + jne L_AES_GCM_decrypt_avx1_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + vpinsrd xmm0, xmm0, DWORD PTR [esi], 0 + vpinsrd xmm0, xmm0, DWORD PTR [esi+4], 1 + vpinsrd xmm0, xmm0, DWORD PTR [esi+8], 2 + vpinsrd xmm0, xmm0, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqa xmm1, OWORD PTR [ebp] + vpxor xmm5, xmm0, xmm1 + vmovdqa 
xmm3, OWORD PTR [ebp+16] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+32] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+48] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+64] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+80] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+96] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+112] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+128] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+144] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + cmp DWORD PTR [esp+236], 11 + vmovdqa xmm3, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_avx1_calc_iv_12_last + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+176] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + cmp DWORD PTR [esp+236], 13 + vmovdqa xmm3, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_avx1_calc_iv_12_last + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+208] + vaesenc xmm1, xmm1, xmm3 + vaesenc xmm5, xmm5, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx1_calc_iv_12_last: + vaesenclast xmm1, xmm1, xmm3 + vaesenclast xmm5, xmm5, xmm3 + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_mask + vmovdqu OWORD PTR [esp+80], xmm5 + jmp L_AES_GCM_decrypt_avx1_iv_done +L_AES_GCM_decrypt_avx1_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm1, OWORD PTR [ebp] + vaesenc xmm1, xmm1, [ebp+16] + vaesenc xmm1, xmm1, [ebp+32] + vaesenc xmm1, xmm1, [ebp+48] + vaesenc xmm1, xmm1, [ebp+64] + vaesenc xmm1, xmm1, [ebp+80] + vaesenc xmm1, xmm1, [ebp+96] + vaesenc xmm1, xmm1, [ebp+112] + vaesenc xmm1, xmm1, [ebp+128] + vaesenc xmm1, xmm1, [ebp+144] + cmp DWORD 
PTR [esp+236], 11 + vmovdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last + vaesenc xmm1, xmm1, xmm5 + vaesenc xmm1, xmm1, [ebp+176] + cmp DWORD PTR [esp+236], 13 + vmovdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last + vaesenc xmm1, xmm1, xmm5 + vaesenc xmm1, xmm1, [ebp+208] + vmovdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last: + vaesenclast xmm1, xmm1, xmm5 + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov ecx, 0 + je L_AES_GCM_decrypt_avx1_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_decrypt_avx1_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_avx1_calc_iv_16_loop: + vmovdqu xmm4, OWORD PTR [esi+ecx] + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm4 + ; ghash_gfmul_avx + vpshufd xmm5, xmm0, 78 + vpshufd xmm6, xmm1, 78 + vpclmulqdq xmm7, xmm1, xmm0, 17 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpxor xmm5, xmm5, xmm0 + vpxor xmm6, xmm6, xmm1 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vmovdqa xmm3, xmm4 + vmovdqa xmm0, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm3, xmm3, xmm6 + vpxor xmm0, xmm0, xmm5 + vpsrld xmm4, xmm3, 31 + vpsrld xmm5, xmm0, 31 + vpslld xmm3, xmm3, 1 + vpslld xmm0, xmm0, 1 + vpsrldq xmm6, xmm4, 12 + vpslldq xmm4, xmm4, 4 + vpslldq xmm5, xmm5, 4 + vpor xmm0, xmm0, xmm6 + vpor xmm3, xmm3, xmm4 + vpor xmm0, xmm0, xmm5 + vpslld xmm4, xmm3, 31 + vpslld xmm5, xmm3, 30 + vpslld xmm6, xmm3, 25 + vpxor xmm4, xmm4, xmm5 + vpxor xmm4, xmm4, xmm6 + vmovdqa xmm5, xmm4 + vpsrldq xmm5, xmm5, 4 + vpslldq xmm4, xmm4, 12 + vpxor xmm3, xmm3, xmm4 + vpsrld xmm6, xmm3, 1 + vpsrld xmm7, xmm3, 2 + vpsrld xmm4, xmm3, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm4 + vpxor xmm6, xmm6, xmm5 + vpxor xmm6, xmm6, xmm3 + vpxor xmm0, xmm0, xmm6 + add ecx, 16 + cmp ecx, edx + jl 
L_AES_GCM_decrypt_avx1_calc_iv_16_loop + mov edx, DWORD PTR [esp+224] + cmp ecx, edx + je L_AES_GCM_decrypt_avx1_calc_iv_done +L_AES_GCM_decrypt_avx1_calc_iv_lt16: + sub esp, 16 + vpxor xmm4, xmm4, xmm4 + xor ebx, ebx + vmovdqu OWORD PTR [esp], xmm4 +L_AES_GCM_decrypt_avx1_calc_iv_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_avx1_calc_iv_loop + vmovdqu xmm4, OWORD PTR [esp] + add esp, 16 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm4 + ; ghash_gfmul_avx + vpshufd xmm5, xmm0, 78 + vpshufd xmm6, xmm1, 78 + vpclmulqdq xmm7, xmm1, xmm0, 17 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpxor xmm5, xmm5, xmm0 + vpxor xmm6, xmm6, xmm1 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vmovdqa xmm3, xmm4 + vmovdqa xmm0, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm3, xmm3, xmm6 + vpxor xmm0, xmm0, xmm5 + vpsrld xmm4, xmm3, 31 + vpsrld xmm5, xmm0, 31 + vpslld xmm3, xmm3, 1 + vpslld xmm0, xmm0, 1 + vpsrldq xmm6, xmm4, 12 + vpslldq xmm4, xmm4, 4 + vpslldq xmm5, xmm5, 4 + vpor xmm0, xmm0, xmm6 + vpor xmm3, xmm3, xmm4 + vpor xmm0, xmm0, xmm5 + vpslld xmm4, xmm3, 31 + vpslld xmm5, xmm3, 30 + vpslld xmm6, xmm3, 25 + vpxor xmm4, xmm4, xmm5 + vpxor xmm4, xmm4, xmm6 + vmovdqa xmm5, xmm4 + vpsrldq xmm5, xmm5, 4 + vpslldq xmm4, xmm4, 12 + vpxor xmm3, xmm3, xmm4 + vpsrld xmm6, xmm3, 1 + vpsrld xmm7, xmm3, 2 + vpsrld xmm4, xmm3, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm4 + vpxor xmm6, xmm6, xmm5 + vpxor xmm6, xmm6, xmm3 + vpxor xmm0, xmm0, xmm6 +L_AES_GCM_decrypt_avx1_calc_iv_done: + ; T = Encrypt counter + vpxor xmm4, xmm4, xmm4 + shl edx, 3 + vpinsrd xmm4, xmm4, edx, 0 + vpxor xmm0, xmm0, xmm4 + ; ghash_gfmul_avx + vpshufd xmm5, xmm0, 78 + vpshufd xmm6, xmm1, 78 + vpclmulqdq xmm7, xmm1, xmm0, 17 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpxor xmm5, xmm5, xmm0 + vpxor xmm6, xmm6, xmm1 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor 
xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vmovdqa xmm3, xmm4 + vmovdqa xmm0, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm3, xmm3, xmm6 + vpxor xmm0, xmm0, xmm5 + vpsrld xmm4, xmm3, 31 + vpsrld xmm5, xmm0, 31 + vpslld xmm3, xmm3, 1 + vpslld xmm0, xmm0, 1 + vpsrldq xmm6, xmm4, 12 + vpslldq xmm4, xmm4, 4 + vpslldq xmm5, xmm5, 4 + vpor xmm0, xmm0, xmm6 + vpor xmm3, xmm3, xmm4 + vpor xmm0, xmm0, xmm5 + vpslld xmm4, xmm3, 31 + vpslld xmm5, xmm3, 30 + vpslld xmm6, xmm3, 25 + vpxor xmm4, xmm4, xmm5 + vpxor xmm4, xmm4, xmm6 + vmovdqa xmm5, xmm4 + vpsrldq xmm5, xmm5, 4 + vpslldq xmm4, xmm4, 12 + vpxor xmm3, xmm3, xmm4 + vpsrld xmm6, xmm3, 1 + vpsrld xmm7, xmm3, 2 + vpsrld xmm4, xmm3, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm4 + vpxor xmm6, xmm6, xmm5 + vpxor xmm6, xmm6, xmm3 + vpxor xmm0, xmm0, xmm6 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + ; Encrypt counter + vmovdqa xmm4, OWORD PTR [ebp] + vpxor xmm4, xmm4, xmm0 + vaesenc xmm4, xmm4, [ebp+16] + vaesenc xmm4, xmm4, [ebp+32] + vaesenc xmm4, xmm4, [ebp+48] + vaesenc xmm4, xmm4, [ebp+64] + vaesenc xmm4, xmm4, [ebp+80] + vaesenc xmm4, xmm4, [ebp+96] + vaesenc xmm4, xmm4, [ebp+112] + vaesenc xmm4, xmm4, [ebp+128] + vaesenc xmm4, xmm4, [ebp+144] + cmp DWORD PTR [esp+236], 11 + vmovdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last + vaesenc xmm4, xmm4, xmm5 + vaesenc xmm4, xmm4, [ebp+176] + cmp DWORD PTR [esp+236], 13 + vmovdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last + vaesenc xmm4, xmm4, xmm5 + vaesenc xmm4, xmm4, [ebp+208] + vmovdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last: + vaesenclast xmm4, xmm4, xmm5 + vmovdqu OWORD PTR [esp+80], xmm4 +L_AES_GCM_decrypt_avx1_iv_done: + mov esi, DWORD PTR [esp+204] + ; Additional authentication data + mov edx, DWORD PTR [esp+220] + cmp edx, 0 + je L_AES_GCM_decrypt_avx1_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl 
L_AES_GCM_decrypt_avx1_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_avx1_calc_aad_16_loop: + vmovdqu xmm4, OWORD PTR [esi+ecx] + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm2, xmm2, xmm4 + ; ghash_gfmul_avx + vpshufd xmm5, xmm2, 78 + vpshufd xmm6, xmm1, 78 + vpclmulqdq xmm7, xmm1, xmm2, 17 + vpclmulqdq xmm4, xmm1, xmm2, 0 + vpxor xmm5, xmm5, xmm2 + vpxor xmm6, xmm6, xmm1 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vmovdqa xmm3, xmm4 + vmovdqa xmm2, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm3, xmm3, xmm6 + vpxor xmm2, xmm2, xmm5 + vpsrld xmm4, xmm3, 31 + vpsrld xmm5, xmm2, 31 + vpslld xmm3, xmm3, 1 + vpslld xmm2, xmm2, 1 + vpsrldq xmm6, xmm4, 12 + vpslldq xmm4, xmm4, 4 + vpslldq xmm5, xmm5, 4 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm4 + vpor xmm2, xmm2, xmm5 + vpslld xmm4, xmm3, 31 + vpslld xmm5, xmm3, 30 + vpslld xmm6, xmm3, 25 + vpxor xmm4, xmm4, xmm5 + vpxor xmm4, xmm4, xmm6 + vmovdqa xmm5, xmm4 + vpsrldq xmm5, xmm5, 4 + vpslldq xmm4, xmm4, 12 + vpxor xmm3, xmm3, xmm4 + vpsrld xmm6, xmm3, 1 + vpsrld xmm7, xmm3, 2 + vpsrld xmm4, xmm3, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm4 + vpxor xmm6, xmm6, xmm5 + vpxor xmm6, xmm6, xmm3 + vpxor xmm2, xmm2, xmm6 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_avx1_calc_aad_16_loop + mov edx, DWORD PTR [esp+220] + cmp ecx, edx + je L_AES_GCM_decrypt_avx1_calc_aad_done +L_AES_GCM_decrypt_avx1_calc_aad_lt16: + sub esp, 16 + vpxor xmm4, xmm4, xmm4 + xor ebx, ebx + vmovdqu OWORD PTR [esp], xmm4 +L_AES_GCM_decrypt_avx1_calc_aad_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_avx1_calc_aad_loop + vmovdqu xmm4, OWORD PTR [esp] + add esp, 16 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm2, xmm2, xmm4 + ; ghash_gfmul_avx + vpshufd xmm5, xmm2, 78 + vpshufd xmm6, xmm1, 78 + vpclmulqdq xmm7, xmm1, xmm2, 17 + vpclmulqdq xmm4, 
xmm1, xmm2, 0 + vpxor xmm5, xmm5, xmm2 + vpxor xmm6, xmm6, xmm1 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vmovdqa xmm3, xmm4 + vmovdqa xmm2, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm3, xmm3, xmm6 + vpxor xmm2, xmm2, xmm5 + vpsrld xmm4, xmm3, 31 + vpsrld xmm5, xmm2, 31 + vpslld xmm3, xmm3, 1 + vpslld xmm2, xmm2, 1 + vpsrldq xmm6, xmm4, 12 + vpslldq xmm4, xmm4, 4 + vpslldq xmm5, xmm5, 4 + vpor xmm2, xmm2, xmm6 + vpor xmm3, xmm3, xmm4 + vpor xmm2, xmm2, xmm5 + vpslld xmm4, xmm3, 31 + vpslld xmm5, xmm3, 30 + vpslld xmm6, xmm3, 25 + vpxor xmm4, xmm4, xmm5 + vpxor xmm4, xmm4, xmm6 + vmovdqa xmm5, xmm4 + vpsrldq xmm5, xmm5, 4 + vpslldq xmm4, xmm4, 12 + vpxor xmm3, xmm3, xmm4 + vpsrld xmm6, xmm3, 1 + vpsrld xmm7, xmm3, 2 + vpsrld xmm4, xmm3, 7 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm4 + vpxor xmm6, xmm6, xmm5 + vpxor xmm6, xmm6, xmm3 + vpxor xmm2, xmm2, xmm6 +L_AES_GCM_decrypt_avx1_calc_aad_done: + vmovdqu OWORD PTR [esp+96], xmm2 + mov esi, DWORD PTR [esp+196] + mov edi, DWORD PTR [esp+200] + ; Calculate counter and H + vpsrlq xmm5, xmm1, 63 + vpsllq xmm4, xmm1, 1 + vpslldq xmm5, xmm5, 8 + vpor xmm4, xmm4, xmm5 + vpshufd xmm1, xmm1, 255 + vpsrad xmm1, xmm1, 31 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_epi64 + vpand xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_mod2_128 + vpaddd xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_one + vpxor xmm1, xmm1, xmm4 + vmovdqu OWORD PTR [esp+64], xmm0 + xor ebx, ebx + cmp DWORD PTR [esp+216], 64 + mov eax, DWORD PTR [esp+216] + jl L_AES_GCM_decrypt_avx1_done_64 + and eax, 4294967232 + vmovdqa xmm6, xmm2 + ; H ^ 1 + vmovdqu OWORD PTR [esp], xmm1 + ; H ^ 2 + vpclmulqdq xmm4, xmm1, xmm1, 0 + vpclmulqdq xmm0, xmm1, xmm1, 17 + vpslld xmm5, xmm4, 31 + vpslld xmm6, xmm4, 30 + vpslld xmm7, xmm4, 25 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm7 + vpsrldq xmm7, xmm5, 4 + vpslldq xmm5, xmm5, 12 + vpxor xmm4, xmm4, xmm5 + vpsrld xmm5, xmm4, 1 + vpsrld xmm6, xmm4, 2 + vpxor xmm5, 
xmm5, xmm6 + vpxor xmm5, xmm5, xmm4 + vpsrld xmm4, xmm4, 7 + vpxor xmm5, xmm5, xmm7 + vpxor xmm5, xmm5, xmm4 + vpxor xmm0, xmm0, xmm5 + vmovdqu OWORD PTR [esp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm5, xmm1, 78 + vpshufd xmm6, xmm0, 78 + vpclmulqdq xmm7, xmm0, xmm1, 17 + vpclmulqdq xmm4, xmm0, xmm1, 0 + vpxor xmm5, xmm5, xmm1 + vpxor xmm6, xmm6, xmm0 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm4, xmm4, xmm6 + vpxor xmm3, xmm7, xmm5 + vpslld xmm5, xmm4, 31 + vpslld xmm6, xmm4, 30 + vpslld xmm7, xmm4, 25 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm7 + vpsrldq xmm7, xmm5, 4 + vpslldq xmm5, xmm5, 12 + vpxor xmm4, xmm4, xmm5 + vpsrld xmm5, xmm4, 1 + vpsrld xmm6, xmm4, 2 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm4 + vpsrld xmm4, xmm4, 7 + vpxor xmm5, xmm5, xmm7 + vpxor xmm5, xmm5, xmm4 + vpxor xmm3, xmm3, xmm5 + vmovdqu OWORD PTR [esp+32], xmm3 + ; H ^ 4 + vpclmulqdq xmm4, xmm0, xmm0, 0 + vpclmulqdq xmm3, xmm0, xmm0, 17 + vpslld xmm5, xmm4, 31 + vpslld xmm6, xmm4, 30 + vpslld xmm7, xmm4, 25 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm7 + vpsrldq xmm7, xmm5, 4 + vpslldq xmm5, xmm5, 12 + vpxor xmm4, xmm4, xmm5 + vpsrld xmm5, xmm4, 1 + vpsrld xmm6, xmm4, 2 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm4 + vpsrld xmm4, xmm4, 7 + vpxor xmm5, xmm5, xmm7 + vpxor xmm5, xmm5, xmm4 + vpxor xmm3, xmm3, xmm5 + vmovdqu OWORD PTR [esp+48], xmm3 + cmp edi, esi + jne L_AES_GCM_decrypt_avx1_ghash_64 +L_AES_GCM_decrypt_avx1_ghash_64_inplace: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + vmovdqu xmm4, OWORD PTR [esp+64] + vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx1_four + vmovdqu OWORD PTR [esp+64], xmm3 + vmovdqa xmm3, OWORD PTR L_aes_gcm_avx1_bswap_epi64 + vpaddd xmm5, xmm4, OWORD PTR L_aes_gcm_avx1_one + vpshufb xmm5, xmm5, xmm3 + vpaddd xmm6, xmm4, OWORD PTR L_aes_gcm_avx1_two + vpshufb xmm6, xmm6, xmm3 + vpaddd xmm7, xmm4, OWORD PTR 
L_aes_gcm_avx1_three + vpshufb xmm7, xmm7, xmm3 + vpshufb xmm4, xmm4, xmm3 + vmovdqa xmm3, OWORD PTR [ebp] + vpxor xmm4, xmm4, xmm3 + vpxor xmm5, xmm5, xmm3 + vpxor xmm6, xmm6, xmm3 + vpxor xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+16] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+32] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+48] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+64] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+80] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+96] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+112] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+128] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+144] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + cmp DWORD PTR [esp+236], 11 + vmovdqa xmm3, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+176] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + cmp DWORD PTR [esp+236], 13 + vmovdqa xmm3, OWORD PTR [ebp+192] + jl 
L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+208] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done: + vaesenclast xmm4, xmm4, xmm3 + vaesenclast xmm5, xmm5, xmm3 + vmovdqu xmm0, OWORD PTR [ecx] + vmovdqu xmm1, OWORD PTR [ecx+16] + vpxor xmm4, xmm4, xmm0 + vpxor xmm5, xmm5, xmm1 + vmovdqu OWORD PTR [esp+112], xmm0 + vmovdqu OWORD PTR [esp+128], xmm1 + vmovdqu OWORD PTR [edx], xmm4 + vmovdqu OWORD PTR [edx+16], xmm5 + vaesenclast xmm6, xmm6, xmm3 + vaesenclast xmm7, xmm7, xmm3 + vmovdqu xmm0, OWORD PTR [ecx+32] + vmovdqu xmm1, OWORD PTR [ecx+48] + vpxor xmm6, xmm6, xmm0 + vpxor xmm7, xmm7, xmm1 + vmovdqu OWORD PTR [esp+144], xmm0 + vmovdqu OWORD PTR [esp+160], xmm1 + vmovdqu OWORD PTR [edx+32], xmm6 + vmovdqu OWORD PTR [edx+48], xmm7 + ; ghash encrypted counter + vmovdqu xmm6, OWORD PTR [esp+96] + vmovdqu xmm3, OWORD PTR [esp+48] + vmovdqu xmm4, OWORD PTR [esp+112] + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm4, xmm4, xmm6 + vpshufd xmm5, xmm3, 78 + vpshufd xmm1, xmm4, 78 + vpxor xmm5, xmm5, xmm3 + vpxor xmm1, xmm1, xmm4 + vpclmulqdq xmm7, xmm4, xmm3, 17 + vpclmulqdq xmm6, xmm4, xmm3, 0 + vpclmulqdq xmm5, xmm5, xmm1, 0 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm7 + vmovdqu xmm3, OWORD PTR [esp+32] + vmovdqu xmm4, OWORD PTR [esp+128] + vpshufd xmm0, xmm3, 78 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm3 + vpshufd xmm1, xmm4, 78 + vpxor xmm1, xmm1, xmm4 + vpclmulqdq xmm2, xmm4, xmm3, 17 + vpclmulqdq xmm3, xmm4, xmm3, 0 + vpclmulqdq xmm0, xmm0, xmm1, 0 + vpxor xmm5, xmm5, xmm3 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm7, xmm7, xmm2 + vpxor xmm5, xmm5, xmm0 + 
vmovdqu xmm3, OWORD PTR [esp+16] + vmovdqu xmm4, OWORD PTR [esp+144] + vpshufd xmm0, xmm3, 78 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm3 + vpshufd xmm1, xmm4, 78 + vpxor xmm1, xmm1, xmm4 + vpclmulqdq xmm2, xmm4, xmm3, 17 + vpclmulqdq xmm3, xmm4, xmm3, 0 + vpclmulqdq xmm0, xmm0, xmm1, 0 + vpxor xmm5, xmm5, xmm3 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm7, xmm7, xmm2 + vpxor xmm5, xmm5, xmm0 + vmovdqu xmm3, OWORD PTR [esp] + vmovdqu xmm4, OWORD PTR [esp+160] + vpshufd xmm0, xmm3, 78 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm3 + vpshufd xmm1, xmm4, 78 + vpxor xmm1, xmm1, xmm4 + vpclmulqdq xmm2, xmm4, xmm3, 17 + vpclmulqdq xmm3, xmm4, xmm3, 0 + vpclmulqdq xmm0, xmm0, xmm1, 0 + vpxor xmm5, xmm5, xmm3 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm7, xmm7, xmm2 + vpxor xmm5, xmm5, xmm0 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm6, xmm6, xmm1 + vpxor xmm7, xmm7, xmm5 + vpslld xmm3, xmm6, 31 + vpslld xmm0, xmm6, 30 + vpslld xmm1, xmm6, 25 + vpxor xmm3, xmm3, xmm0 + vpxor xmm3, xmm3, xmm1 + vpsrldq xmm0, xmm3, 4 + vpslldq xmm3, xmm3, 12 + vpxor xmm6, xmm6, xmm3 + vpsrld xmm1, xmm6, 1 + vpsrld xmm5, xmm6, 2 + vpsrld xmm4, xmm6, 7 + vpxor xmm1, xmm1, xmm5 + vpxor xmm1, xmm1, xmm4 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 + vpxor xmm6, xmm6, xmm7 + vmovdqu OWORD PTR [esp+96], xmm6 + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_decrypt_avx1_ghash_64_inplace + jmp L_AES_GCM_decrypt_avx1_ghash_64_done +L_AES_GCM_decrypt_avx1_ghash_64: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + vmovdqu xmm4, OWORD PTR [esp+64] + vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx1_four + vmovdqu OWORD PTR [esp+64], xmm3 + vmovdqa xmm3, OWORD PTR L_aes_gcm_avx1_bswap_epi64 + vpaddd xmm5, xmm4, OWORD PTR L_aes_gcm_avx1_one + vpshufb xmm5, xmm5, xmm3 + vpaddd xmm6, xmm4, OWORD PTR L_aes_gcm_avx1_two + vpshufb xmm6, xmm6, xmm3 + vpaddd xmm7, xmm4, OWORD PTR 
L_aes_gcm_avx1_three + vpshufb xmm7, xmm7, xmm3 + vpshufb xmm4, xmm4, xmm3 + vmovdqa xmm3, OWORD PTR [ebp] + vpxor xmm4, xmm4, xmm3 + vpxor xmm5, xmm5, xmm3 + vpxor xmm6, xmm6, xmm3 + vpxor xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+16] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+32] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+48] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+64] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+80] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+96] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+112] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+128] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+144] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + cmp DWORD PTR [esp+236], 11 + vmovdqa xmm3, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+176] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + cmp DWORD PTR [esp+236], 13 + vmovdqa xmm3, OWORD PTR [ebp+192] + jl 
L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+208] + vaesenc xmm4, xmm4, xmm3 + vaesenc xmm5, xmm5, xmm3 + vaesenc xmm6, xmm6, xmm3 + vaesenc xmm7, xmm7, xmm3 + vmovdqa xmm3, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done: + vaesenclast xmm4, xmm4, xmm3 + vaesenclast xmm5, xmm5, xmm3 + vmovdqu xmm0, OWORD PTR [ecx] + vmovdqu xmm1, OWORD PTR [ecx+16] + vpxor xmm4, xmm4, xmm0 + vpxor xmm5, xmm5, xmm1 + vmovdqu OWORD PTR [edx], xmm4 + vmovdqu OWORD PTR [edx+16], xmm5 + vaesenclast xmm6, xmm6, xmm3 + vaesenclast xmm7, xmm7, xmm3 + vmovdqu xmm0, OWORD PTR [ecx+32] + vmovdqu xmm1, OWORD PTR [ecx+48] + vpxor xmm6, xmm6, xmm0 + vpxor xmm7, xmm7, xmm1 + vmovdqu OWORD PTR [edx+32], xmm6 + vmovdqu OWORD PTR [edx+48], xmm7 + ; ghash encrypted counter + vmovdqu xmm6, OWORD PTR [esp+96] + vmovdqu xmm3, OWORD PTR [esp+48] + vmovdqu xmm4, OWORD PTR [ecx] + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm4, xmm4, xmm6 + vpshufd xmm5, xmm3, 78 + vpshufd xmm1, xmm4, 78 + vpxor xmm5, xmm5, xmm3 + vpxor xmm1, xmm1, xmm4 + vpclmulqdq xmm7, xmm4, xmm3, 17 + vpclmulqdq xmm6, xmm4, xmm3, 0 + vpclmulqdq xmm5, xmm5, xmm1, 0 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm7 + vmovdqu xmm3, OWORD PTR [esp+32] + vmovdqu xmm4, OWORD PTR [ecx+16] + vpshufd xmm0, xmm3, 78 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm3 + vpshufd xmm1, xmm4, 78 + vpxor xmm1, xmm1, xmm4 + vpclmulqdq xmm2, xmm4, xmm3, 17 + vpclmulqdq xmm3, xmm4, xmm3, 0 + vpclmulqdq xmm0, xmm0, xmm1, 0 + vpxor xmm5, xmm5, xmm3 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm7, xmm7, xmm2 + vpxor xmm5, xmm5, xmm0 + vmovdqu xmm3, OWORD PTR [esp+16] + vmovdqu xmm4, OWORD PTR [ecx+32] + vpshufd xmm0, xmm3, 78 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, 
xmm0, xmm3 + vpshufd xmm1, xmm4, 78 + vpxor xmm1, xmm1, xmm4 + vpclmulqdq xmm2, xmm4, xmm3, 17 + vpclmulqdq xmm3, xmm4, xmm3, 0 + vpclmulqdq xmm0, xmm0, xmm1, 0 + vpxor xmm5, xmm5, xmm3 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm7, xmm7, xmm2 + vpxor xmm5, xmm5, xmm0 + vmovdqu xmm3, OWORD PTR [esp] + vmovdqu xmm4, OWORD PTR [ecx+48] + vpshufd xmm0, xmm3, 78 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm3 + vpshufd xmm1, xmm4, 78 + vpxor xmm1, xmm1, xmm4 + vpclmulqdq xmm2, xmm4, xmm3, 17 + vpclmulqdq xmm3, xmm4, xmm3, 0 + vpclmulqdq xmm0, xmm0, xmm1, 0 + vpxor xmm5, xmm5, xmm3 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm7, xmm7, xmm2 + vpxor xmm5, xmm5, xmm0 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm6, xmm6, xmm1 + vpxor xmm7, xmm7, xmm5 + vpslld xmm3, xmm6, 31 + vpslld xmm0, xmm6, 30 + vpslld xmm1, xmm6, 25 + vpxor xmm3, xmm3, xmm0 + vpxor xmm3, xmm3, xmm1 + vpsrldq xmm0, xmm3, 4 + vpslldq xmm3, xmm3, 12 + vpxor xmm6, xmm6, xmm3 + vpsrld xmm1, xmm6, 1 + vpsrld xmm5, xmm6, 2 + vpsrld xmm4, xmm6, 7 + vpxor xmm1, xmm1, xmm5 + vpxor xmm1, xmm1, xmm4 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 + vpxor xmm6, xmm6, xmm7 + vmovdqu OWORD PTR [esp+96], xmm6 + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_decrypt_avx1_ghash_64 +L_AES_GCM_decrypt_avx1_ghash_64_done: + vmovdqa xmm2, xmm6 + vmovdqu xmm1, OWORD PTR [esp] +L_AES_GCM_decrypt_avx1_done_64: + mov edx, DWORD PTR [esp+216] + cmp ebx, edx + jge L_AES_GCM_decrypt_avx1_done_dec + mov eax, DWORD PTR [esp+216] + and eax, 4294967280 + cmp ebx, eax + jge L_AES_GCM_decrypt_avx1_last_block_done +L_AES_GCM_decrypt_avx1_last_block_start: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + vmovdqu xmm7, OWORD PTR [ecx] + pshufb xmm7, OWORD PTR L_aes_gcm_avx1_bswap_mask + pxor xmm7, xmm2 + vmovdqu xmm5, OWORD PTR [esp+64] + vpshufb xmm4, xmm5, OWORD PTR L_aes_gcm_avx1_bswap_epi64 + vpaddd xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_one + 
vmovdqu OWORD PTR [esp+64], xmm5 + vpxor xmm4, xmm4, [ebp] + vpclmulqdq xmm0, xmm7, xmm1, 16 + vaesenc xmm4, xmm4, [ebp+16] + vaesenc xmm4, xmm4, [ebp+32] + vpclmulqdq xmm3, xmm7, xmm1, 1 + vaesenc xmm4, xmm4, [ebp+48] + vaesenc xmm4, xmm4, [ebp+64] + vaesenc xmm4, xmm4, [ebp+80] + vpclmulqdq xmm5, xmm7, xmm1, 17 + vaesenc xmm4, xmm4, [ebp+96] + vpxor xmm0, xmm0, xmm3 + vpslldq xmm6, xmm0, 8 + vpsrldq xmm0, xmm0, 8 + vaesenc xmm4, xmm4, [ebp+112] + vpclmulqdq xmm3, xmm7, xmm1, 0 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm0 + vmovdqa xmm7, OWORD PTR L_aes_gcm_avx1_mod2_128 + vpclmulqdq xmm3, xmm6, xmm7, 16 + vaesenc xmm4, xmm4, [ebp+128] + vpshufd xmm0, xmm6, 78 + vpxor xmm0, xmm0, xmm3 + vpclmulqdq xmm3, xmm0, xmm7, 16 + vaesenc xmm4, xmm4, [ebp+144] + vpshufd xmm2, xmm0, 78 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm5 + cmp DWORD PTR [esp+236], 11 + vmovdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_avx1_aesenc_gfmul_last + vaesenc xmm4, xmm4, xmm5 + vaesenc xmm4, xmm4, [ebp+176] + cmp DWORD PTR [esp+236], 13 + vmovdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_avx1_aesenc_gfmul_last + vaesenc xmm4, xmm4, xmm5 + vaesenc xmm4, xmm4, [ebp+208] + vmovdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx1_aesenc_gfmul_last: + vaesenclast xmm4, xmm4, xmm5 + vmovdqu xmm5, OWORD PTR [ecx] + vpxor xmm4, xmm4, xmm5 + vmovdqu OWORD PTR [edx], xmm4 + add ebx, 16 + cmp ebx, eax + jl L_AES_GCM_decrypt_avx1_last_block_start +L_AES_GCM_decrypt_avx1_last_block_done: + mov ecx, DWORD PTR [esp+216] + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done + vmovdqu xmm0, OWORD PTR [esp+64] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_epi64 + vpxor xmm0, xmm0, [ebp] + vaesenc xmm0, xmm0, [ebp+16] + vaesenc xmm0, xmm0, [ebp+32] + vaesenc xmm0, xmm0, [ebp+48] + vaesenc xmm0, xmm0, [ebp+64] + vaesenc xmm0, xmm0, [ebp+80] + vaesenc xmm0, xmm0, [ebp+96] + vaesenc xmm0, xmm0, [ebp+112] + vaesenc xmm0, xmm0, [ebp+128] + vaesenc xmm0, 
xmm0, [ebp+144] + cmp DWORD PTR [esp+236], 11 + vmovdqa xmm5, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm0, xmm0, xmm5 + vaesenc xmm0, xmm0, [ebp+176] + cmp DWORD PTR [esp+236], 13 + vmovdqa xmm5, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm0, xmm0, xmm5 + vaesenc xmm0, xmm0, [ebp+208] + vmovdqa xmm5, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast xmm0, xmm0, xmm5 + sub esp, 32 + xor ecx, ecx + vmovdqu OWORD PTR [esp], xmm0 + vpxor xmm4, xmm4, xmm4 + vmovdqu OWORD PTR [esp+16], xmm4 +L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop: + movzx eax, BYTE PTR [esi+ebx] + mov BYTE PTR [esp+ecx+16], al + xor al, BYTE PTR [esp+ecx] + mov BYTE PTR [edi+ebx], al + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop + vmovdqu xmm0, OWORD PTR [esp+16] + add esp, 32 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm2, xmm2, xmm0 + ; ghash_gfmul_red_avx + vpshufd xmm5, xmm1, 78 + vpshufd xmm6, xmm2, 78 + vpclmulqdq xmm7, xmm2, xmm1, 17 + vpclmulqdq xmm4, xmm2, xmm1, 0 + vpxor xmm5, xmm5, xmm1 + vpxor xmm6, xmm6, xmm2 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm4, xmm4, xmm6 + vpxor xmm2, xmm7, xmm5 + vpslld xmm5, xmm4, 31 + vpslld xmm6, xmm4, 30 + vpslld xmm7, xmm4, 25 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm7 + vpsrldq xmm7, xmm5, 4 + vpslldq xmm5, xmm5, 12 + vpxor xmm4, xmm4, xmm5 + vpsrld xmm5, xmm4, 1 + vpsrld xmm6, xmm4, 2 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm4 + vpsrld xmm4, xmm4, 7 + vpxor xmm5, xmm5, xmm7 + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm2, xmm5 +L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_avx1_done_dec: + mov esi, DWORD PTR [esp+212] + mov ebp, DWORD PTR [esp+228] + mov edx, DWORD PTR [esp+216] + mov ecx, 
DWORD PTR [esp+220] + shl edx, 3 + shl ecx, 3 + vpinsrd xmm4, xmm4, edx, 0 + vpinsrd xmm4, xmm4, ecx, 2 + mov edx, DWORD PTR [esp+216] + mov ecx, DWORD PTR [esp+220] + shr edx, 29 + shr ecx, 29 + vpinsrd xmm4, xmm4, edx, 1 + vpinsrd xmm4, xmm4, ecx, 3 + vpxor xmm2, xmm2, xmm4 + ; ghash_gfmul_red_avx + vpshufd xmm5, xmm1, 78 + vpshufd xmm6, xmm2, 78 + vpclmulqdq xmm7, xmm2, xmm1, 17 + vpclmulqdq xmm4, xmm2, xmm1, 0 + vpxor xmm5, xmm5, xmm1 + vpxor xmm6, xmm6, xmm2 + vpclmulqdq xmm5, xmm5, xmm6, 0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm5, xmm5, xmm7 + vpslldq xmm6, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm4, xmm4, xmm6 + vpxor xmm2, xmm7, xmm5 + vpslld xmm5, xmm4, 31 + vpslld xmm6, xmm4, 30 + vpslld xmm7, xmm4, 25 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm7 + vpsrldq xmm7, xmm5, 4 + vpslldq xmm5, xmm5, 12 + vpxor xmm4, xmm4, xmm5 + vpsrld xmm5, xmm4, 1 + vpsrld xmm6, xmm4, 2 + vpxor xmm5, xmm5, xmm6 + vpxor xmm5, xmm5, xmm4 + vpsrld xmm4, xmm4, 7 + vpxor xmm5, xmm5, xmm7 + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm2, xmm5 + vpshufb xmm2, xmm2, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm4, xmm2, [esp+80] + mov edi, DWORD PTR [esp+240] + cmp ebp, 16 + je L_AES_GCM_decrypt_avx1_cmp_tag_16 + sub esp, 16 + xor ecx, ecx + xor ebx, ebx + vmovdqu OWORD PTR [esp], xmm4 +L_AES_GCM_decrypt_avx1_cmp_tag_loop: + movzx eax, BYTE PTR [esp+ecx] + xor al, BYTE PTR [esi+ecx] + or bl, al + inc ecx + cmp ecx, ebp + jne L_AES_GCM_decrypt_avx1_cmp_tag_loop + cmp bl, 0 + sete bl + add esp, 16 + xor ecx, ecx + jmp L_AES_GCM_decrypt_avx1_cmp_tag_done +L_AES_GCM_decrypt_avx1_cmp_tag_16: + vmovdqu xmm5, OWORD PTR [esi] + vpcmpeqb xmm4, xmm4, xmm5 + vpmovmskb edx, xmm4 + ; %%edx == 0xFFFF then return 1 else => return 0 + xor ebx, ebx + cmp edx, 65535 + sete bl +L_AES_GCM_decrypt_avx1_cmp_tag_done: + mov DWORD PTR [edi], ebx + add esp, 176 + pop ebp + pop edi + pop esi + pop ebx + ret +AES_GCM_decrypt_avx1 ENDP +_TEXT ENDS +IFDEF WOLFSSL_AESGCM_STREAM +_TEXT SEGMENT READONLY PARA 
+AES_GCM_init_avx1 PROC + push ebx + push esi + push edi + push ebp + sub esp, 16 + mov ebp, DWORD PTR [esp+36] + mov esi, DWORD PTR [esp+44] + mov edi, DWORD PTR [esp+60] + vpxor xmm4, xmm4, xmm4 + mov edx, DWORD PTR [esp+48] + cmp edx, 12 + jne L_AES_GCM_init_avx1_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + vpinsrd xmm4, xmm4, DWORD PTR [esi], 0 + vpinsrd xmm4, xmm4, DWORD PTR [esi+4], 1 + vpinsrd xmm4, xmm4, DWORD PTR [esi+8], 2 + vpinsrd xmm4, xmm4, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqa xmm5, OWORD PTR [ebp] + vpxor xmm1, xmm4, xmm5 + vmovdqa xmm7, OWORD PTR [ebp+16] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+32] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+48] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+64] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+80] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+96] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+112] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+128] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+144] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp DWORD PTR [esp+40], 11 + vmovdqa xmm7, OWORD PTR [ebp+160] + jl L_AES_GCM_init_avx1_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+176] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp DWORD PTR [esp+40], 13 + vmovdqa xmm7, OWORD PTR [ebp+192] + jl L_AES_GCM_init_avx1_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+208] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+224] 
+L_AES_GCM_init_avx1_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_bswap_mask + vmovdqu OWORD PTR [edi], xmm1 + jmp L_AES_GCM_init_avx1_iv_done +L_AES_GCM_init_avx1_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm5, OWORD PTR [ebp] + vaesenc xmm5, xmm5, [ebp+16] + vaesenc xmm5, xmm5, [ebp+32] + vaesenc xmm5, xmm5, [ebp+48] + vaesenc xmm5, xmm5, [ebp+64] + vaesenc xmm5, xmm5, [ebp+80] + vaesenc xmm5, xmm5, [ebp+96] + vaesenc xmm5, xmm5, [ebp+112] + vaesenc xmm5, xmm5, [ebp+128] + vaesenc xmm5, xmm5, [ebp+144] + cmp DWORD PTR [esp+40], 11 + vmovdqa xmm1, OWORD PTR [ebp+160] + jl L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm1 + vaesenc xmm5, xmm5, [ebp+176] + cmp DWORD PTR [esp+40], 13 + vmovdqa xmm1, OWORD PTR [ebp+192] + jl L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm1 + vaesenc xmm5, xmm5, [ebp+208] + vmovdqa xmm1, OWORD PTR [ebp+224] +L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm1 + vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov ecx, 0 + je L_AES_GCM_init_avx1_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_init_avx1_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_init_avx1_calc_iv_16_loop: + vmovdqu xmm0, OWORD PTR [esi+ecx] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 
1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_init_avx1_calc_iv_16_loop + mov edx, DWORD PTR [esp+48] + cmp ecx, edx + je L_AES_GCM_init_avx1_calc_iv_done +L_AES_GCM_init_avx1_calc_iv_lt16: + sub esp, 16 + vpxor xmm0, xmm0, xmm0 + xor ebx, ebx + vmovdqu OWORD PTR [esp], xmm0 +L_AES_GCM_init_avx1_calc_iv_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_init_avx1_calc_iv_loop + vmovdqu xmm0, OWORD PTR [esp] + add esp, 16 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + 
vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 +L_AES_GCM_init_avx1_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vpinsrd xmm0, xmm0, edx, 0 + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask + ; Encrypt counter + vmovdqa xmm0, OWORD PTR [ebp] + vpxor xmm0, xmm0, xmm4 + vaesenc xmm0, xmm0, [ebp+16] + vaesenc xmm0, xmm0, [ebp+32] + vaesenc xmm0, xmm0, [ebp+48] + vaesenc xmm0, xmm0, [ebp+64] + vaesenc xmm0, xmm0, [ebp+80] + vaesenc xmm0, xmm0, [ebp+96] + vaesenc xmm0, xmm0, [ebp+112] + vaesenc xmm0, xmm0, [ebp+128] + vaesenc xmm0, xmm0, [ebp+144] + cmp DWORD PTR [esp+40], 11 + vmovdqa xmm1, OWORD PTR [ebp+160] + jl L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last + vaesenc xmm0, xmm0, xmm1 + vaesenc xmm0, xmm0, [ebp+176] + 
cmp DWORD PTR [esp+40], 13  ; closing part of AES_GCM_init_avx1: a value below 13 makes the key at ebp+192 the last round key
+        vmovdqa xmm1, OWORD PTR [ebp+192]
+        jl L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
+        vaesenc xmm0, xmm0, xmm1
+        vaesenc xmm0, xmm0, [ebp+208]
+        vmovdqa xmm1, OWORD PTR [ebp+224]  ; otherwise two more rounds run and the key at ebp+224 is the last one
+L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last:
+        vaesenclast xmm0, xmm0, xmm1
+        vmovdqu OWORD PTR [edi], xmm0  ; unaligned store of the encrypted block through edi
+L_AES_GCM_init_avx1_iv_done:
+        mov ebp, DWORD PTR [esp+52]
+        mov edi, DWORD PTR [esp+56]
+        vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+        vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_one  ; add the L_aes_gcm_avx1_one constant after the shuffle
+        vmovdqa OWORD PTR [ebp], xmm5  ; aligned stores: xmm5 and xmm4 go out through the two pointers just loaded
+        vmovdqa OWORD PTR [edi], xmm4
+        add esp, 16  ; drop 16 bytes of locals, then restore the saved registers
+        pop ebp
+        pop edi
+        pop esi
+        pop ebx
+        ret
+AES_GCM_init_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_aad_update_avx1 PROC  ; folds 16-byte blocks of a buffer into a 128-bit GHASH state kept in memory
+        push esi
+        push edi
+        mov esi, DWORD PTR [esp+12]  ; 1st stack argument: input buffer
+        mov edx, DWORD PTR [esp+16]  ; 2nd stack argument: byte count used as the loop bound
+        mov edi, DWORD PTR [esp+20]  ; 3rd stack argument: pointer to the 16-byte state (read here, written back at exit)
+        mov eax, DWORD PTR [esp+24]  ; 4th stack argument: pointer to the 16-byte multiplier (presumably the hash key H)
+        vmovdqa xmm5, OWORD PTR [edi]  ; aligned load, so the state must be 16-byte aligned
+        vmovdqa xmm6, OWORD PTR [eax]  ; aligned load, so the multiplier must be 16-byte aligned
+        xor ecx, ecx  ; ecx = byte offset into the input
+L_AES_GCM_aad_update_avx1_16_loop:  ; bound is tested after the body, so one block is always read - NOTE(review): confirm callers never pass 0 bytes
+        vmovdqu xmm0, OWORD PTR [esi+ecx]  ; unaligned load of the next 16 input bytes
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask  ; reorder bytes with the bswap mask constant
+        vpxor xmm5, xmm5, xmm0  ; state ^= block
+        ; ghash_gfmul_avx
+        vpshufd xmm1, xmm5, 78  ; 78 swaps the two 64-bit halves
+        vpshufd xmm2, xmm6, 78
+        vpclmulqdq xmm3, xmm6, xmm5, 17  ; selector 17: high qword times high qword
+        vpclmulqdq xmm0, xmm6, xmm5, 0  ; selector 0: low qword times low qword
+        vpxor xmm1, xmm1, xmm5  ; (high ^ low) of each operand for the middle product
+        vpxor xmm2, xmm2, xmm6
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3  ; xmm1 = middle term
+        vmovdqa xmm4, xmm0  ; xmm4 = low 128 bits of the product
+        vmovdqa xmm5, xmm3  ; xmm5 = high 128 bits of the product
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm4, xmm4, xmm2
+        vpxor xmm5, xmm5, xmm1
+        vpsrld xmm0, xmm4, 31  ; next 10 instructions: shift the 256-bit value xmm5:xmm4 left by one bit
+        vpsrld xmm1, xmm5, 31
+        vpslld xmm4, xmm4, 1
+        vpslld xmm5, xmm5, 1
+        vpsrldq xmm2, xmm0, 12  ; bit carried from the low half into the high half
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm5, xmm5, xmm2
+        vpor xmm4, xmm4, xmm0
+        vpor xmm5, xmm5, xmm1
+        vpslld xmm0, xmm4, 31  ; reduction of the low half: left shifts by 31, 30 and 25 ...
+        vpslld xmm1, xmm4, 30
+        vpslld xmm2, xmm4, 25
+        vpxor xmm0, xmm0, xmm1
+        vpxor xmm0, xmm0, xmm2
+        vmovdqa xmm1, xmm0
+        vpsrldq xmm1, xmm1, 4
+        vpslldq xmm0, xmm0, 12
+        vpxor xmm4, xmm4, xmm0
+        vpsrld xmm2, xmm4, 1  ; ... then right shifts by 1, 2 and 7
+        vpsrld xmm3, xmm4, 2
+        vpsrld xmm0, xmm4, 7
+        vpxor xmm2, xmm2, xmm3
+        vpxor xmm2, xmm2, xmm0
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm2, xmm2, xmm4
+        vpxor xmm5, xmm5, xmm2  ; xmm5 = reduced 128-bit result, the new state
+        add ecx, 16
+        cmp ecx, edx
+        jl L_AES_GCM_aad_update_avx1_16_loop  ; signed compare of offset against the byte count
+        vmovdqa OWORD PTR [edi], xmm5  ; write the updated state back
+        pop edi
+        pop esi
+        ret
+AES_GCM_aad_update_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_block_avx1 PROC  ; encrypts one counter block, XORs it with 16 input bytes and stores 16 output bytes
+        push esi
+        push edi
+        mov ecx, DWORD PTR [esp+12]  ; 1st stack argument: round-key array
+        mov eax, DWORD PTR [esp+16]  ; 2nd stack argument: value tested against 11 and 13 to pick the number of rounds
+        mov edi, DWORD PTR [esp+20]  ; 3rd stack argument: output buffer
+        mov esi, DWORD PTR [esp+24]  ; 4th stack argument: input buffer
+        mov edx, DWORD PTR [esp+28]  ; 5th stack argument: pointer to the 16-byte counter
+        vmovdqu xmm1, OWORD PTR [edx]
+        vpshufb xmm0, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_epi64  ; xmm0 = shuffled copy of the counter, the block to encrypt
+        vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_one
+        vmovdqu OWORD PTR [edx], xmm1  ; counter plus the one constant is stored back for the next call
+        vpxor xmm0, xmm0, [ecx]  ; initial round-key XOR
+        vaesenc xmm0, xmm0, [ecx+16]  ; nine rounds with the keys at ecx+16 through ecx+144
+        vaesenc xmm0, xmm0, [ecx+32]
+        vaesenc xmm0, xmm0, [ecx+48]
+        vaesenc xmm0, xmm0, [ecx+64]
+        vaesenc xmm0, xmm0, [ecx+80]
+        vaesenc xmm0, xmm0, [ecx+96]
+        vaesenc xmm0, xmm0, [ecx+112]
+        vaesenc xmm0, xmm0, [ecx+128]
+        vaesenc xmm0, xmm0, [ecx+144]
+        cmp eax, 11
+        vmovdqa xmm1, OWORD PTR [ecx+160]  ; vmovdqa leaves the flags from cmp untouched
+        jl L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last  ; below 11: key at ecx+160 is the last round key
+        vaesenc xmm0, xmm0, xmm1
+        vaesenc xmm0, xmm0, [ecx+176]
+        cmp eax, 13
+        vmovdqa xmm1, OWORD PTR [ecx+192]
+        jl L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last  ; below 13: key at ecx+192 is the last round key
+        vaesenc xmm0, xmm0, xmm1
+        vaesenc xmm0, xmm0, [ecx+208]
+        vmovdqa xmm1, OWORD PTR [ecx+224]  ; otherwise the key at ecx+224 is the last round key
+L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last:
+        vaesenclast xmm0, xmm0, xmm1
+        vmovdqu xmm1, OWORD PTR [esi]
+        vpxor xmm0, xmm0, xmm1  ; keystream XOR input
+        vmovdqu OWORD PTR [edi], xmm0
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask  ; shuffled output stays in xmm0 only; nothing here stores it
+        pop edi
+        pop esi
+        ret
+AES_GCM_encrypt_block_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_ghash_block_avx1 PROC  ; folds a single 16-byte block into the GHASH state in memory
+        mov edx, DWORD PTR [esp+4]  ; 1st stack argument: the 16-byte block
+        mov eax, DWORD PTR [esp+8]  ; 2nd stack argument: pointer to the 16-byte state (read and written back)
+        mov ecx, DWORD PTR [esp+12]  ; 3rd stack argument: pointer to the 16-byte multiplier (presumably H)
+        vmovdqa xmm4, OWORD PTR [eax]  ; aligned loads for state and multiplier
+        vmovdqa xmm5, OWORD PTR [ecx]
+        vmovdqu xmm0, OWORD PTR [edx]  ; unaligned load for the data block
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+        vpxor xmm4, xmm4, xmm0  ; state ^= block
+        ; ghash_gfmul_avx
+        vpshufd xmm1, xmm4, 78  ; 78 swaps the two 64-bit halves
+        vpshufd xmm2, xmm5, 78
+        vpclmulqdq xmm3, xmm5, xmm4, 17  ; high times high
+        vpclmulqdq xmm0, xmm5, xmm4, 0  ; low times low
+        vpxor xmm1, xmm1, xmm4
+        vpxor xmm2, xmm2, xmm5
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3  ; xmm1 = middle term
+        vmovdqa xmm6, xmm0  ; xmm6 = low 128 bits of the product
+        vmovdqa xmm4, xmm3  ; xmm4 = high 128 bits of the product
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm6, xmm6, xmm2
+        vpxor xmm4, xmm4, xmm1
+        vpsrld xmm0, xmm6, 31  ; next 10 instructions: shift the 256-bit value xmm4:xmm6 left by one bit
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        vpslld xmm0, xmm6, 31  ; reduction of the low half: left shifts by 31, 30 and 25 ...
+        vpslld xmm1, xmm6, 30
+        vpslld xmm2, xmm6, 25
+        vpxor xmm0, xmm0, xmm1
+        vpxor xmm0, xmm0, xmm2
+        vmovdqa xmm1, xmm0
+        vpsrldq xmm1, xmm1, 4
+        vpslldq xmm0, xmm0, 12
+        vpxor xmm6, xmm6, xmm0
+        vpsrld xmm2, xmm6, 1  ; ... then right shifts by 1, 2 and 7
+        vpsrld xmm3, xmm6, 2
+        vpsrld xmm0, xmm6, 7
+        vpxor xmm2, xmm2, xmm3
+        vpxor xmm2, xmm2, xmm0
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm2, xmm2, xmm6
+        vpxor xmm4, xmm4, xmm2  ; xmm4 = reduced 128-bit result
+        vmovdqa OWORD PTR [eax], xmm4  ; write the updated state back
+        ret
+AES_GCM_ghash_block_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_update_avx1 PROC  ; counter-mode encrypts whole 16-byte blocks and folds the output into the GHASH state
+        push ebx
+        push esi
+        push edi
+        push ebp
+        sub esp, 96  ; locals: [esp]..[esp+48] four powers of H, [esp+64] counter, [esp+80] GHASH state
+        mov esi, DWORD PTR [esp+144]  ; 8th stack argument: pointer to the 16-byte counter
+        vmovdqa xmm4, OWORD PTR [esi]
+        vmovdqu OWORD PTR [esp+64], xmm4  ; working copy of the counter
+        mov esi, DWORD PTR [esp+136]  ; 6th stack argument: pointer to the GHASH state
+        mov ebp, DWORD PTR [esp+140]  ; 7th stack argument: pointer to H
+        vmovdqa xmm6, OWORD PTR [esi]
+        vmovdqa xmm5, OWORD PTR [ebp]
+        vmovdqu OWORD PTR [esp+80], xmm6  ; working copy of the GHASH state
+        mov ebp, DWORD PTR [esp+116]  ; 1st stack argument: round-key array
+        mov edi, DWORD PTR [esp+124]  ; 3rd stack argument: output buffer
+        mov esi, DWORD PTR [esp+128]  ; 4th stack argument: input buffer
+        vpsrlq xmm1, xmm5, 63  ; next 8 instructions: xmm5 = H shifted left one bit as a 128-bit value ...
+        vpsllq xmm0, xmm5, 1
+        vpslldq xmm1, xmm1, 8
+        vpor xmm0, xmm0, xmm1
+        vpshufd xmm5, xmm5, 255  ; broadcast the top dword; the arithmetic shift below turns its sign bit into a mask
+        vpsrad xmm5, xmm5, 31
+        vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_mod2_128
+        vpxor xmm5, xmm5, xmm0  ; ... XORed with the mod2_128 constant when the top bit of H was set
+        xor ebx, ebx  ; ebx = bytes processed so far
+        cmp DWORD PTR [esp+132], 64  ; 5th stack argument: byte count
+        mov eax, DWORD PTR [esp+132]
+        jl
L_AES_GCM_encrypt_update_avx1_done_64
+        and eax, 4294967232  ; 0FFFFFFC0h: eax = byte count rounded down to a multiple of 64
+        vmovdqa xmm2, xmm6
+        ; H ^ 1
+        vmovdqu OWORD PTR [esp], xmm5  ; [esp] = H^1
+        ; H ^ 2
+        vpclmulqdq xmm0, xmm5, xmm5, 0  ; squaring: low times low and high times high only
+        vpclmulqdq xmm4, xmm5, xmm5, 17
+        vpslld xmm1, xmm0, 31  ; reduce the low half into the high half
+        vpslld xmm2, xmm0, 30
+        vpslld xmm3, xmm0, 25
+        vpxor xmm1, xmm1, xmm2
+        vpxor xmm1, xmm1, xmm3
+        vpsrldq xmm3, xmm1, 4
+        vpslldq xmm1, xmm1, 12
+        vpxor xmm0, xmm0, xmm1
+        vpsrld xmm1, xmm0, 1
+        vpsrld xmm2, xmm0, 2
+        vpxor xmm1, xmm1, xmm2
+        vpxor xmm1, xmm1, xmm0
+        vpsrld xmm0, xmm0, 7
+        vpxor xmm1, xmm1, xmm3
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm4, xmm4, xmm1
+        vmovdqu OWORD PTR [esp+16], xmm4  ; [esp+16] = H^2
+        ; H ^ 3
+        ; ghash_gfmul_red_avx
+        vpshufd xmm1, xmm5, 78  ; 78 swaps the two 64-bit halves
+        vpshufd xmm2, xmm4, 78
+        vpclmulqdq xmm3, xmm4, xmm5, 17
+        vpclmulqdq xmm0, xmm4, xmm5, 0
+        vpxor xmm1, xmm1, xmm5
+        vpxor xmm2, xmm2, xmm4
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3  ; xmm1 = middle term
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm0, xmm0, xmm2
+        vpxor xmm7, xmm3, xmm1
+        vpslld xmm1, xmm0, 31
+        vpslld xmm2, xmm0, 30
+        vpslld xmm3, xmm0, 25
+        vpxor xmm1, xmm1, xmm2
+        vpxor xmm1, xmm1, xmm3
+        vpsrldq xmm3, xmm1, 4
+        vpslldq xmm1, xmm1, 12
+        vpxor xmm0, xmm0, xmm1
+        vpsrld xmm1, xmm0, 1
+        vpsrld xmm2, xmm0, 2
+        vpxor xmm1, xmm1, xmm2
+        vpxor xmm1, xmm1, xmm0
+        vpsrld xmm0, xmm0, 7
+        vpxor xmm1, xmm1, xmm3
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm7, xmm7, xmm1
+        vmovdqu OWORD PTR [esp+32], xmm7  ; [esp+32] = H^3
+        ; H ^ 4
+        vpclmulqdq xmm0, xmm4, xmm4, 0  ; square of H^2
+        vpclmulqdq xmm7, xmm4, xmm4, 17
+        vpslld xmm1, xmm0, 31
+        vpslld xmm2, xmm0, 30
+        vpslld xmm3, xmm0, 25
+        vpxor xmm1, xmm1, xmm2
+        vpxor xmm1, xmm1, xmm3
+        vpsrldq xmm3, xmm1, 4
+        vpslldq xmm1, xmm1, 12
+        vpxor xmm0, xmm0, xmm1
+        vpsrld xmm1, xmm0, 1
+        vpsrld xmm2, xmm0, 2
+        vpxor xmm1, xmm1, xmm2
+        vpxor xmm1, xmm1, xmm0
+        vpsrld xmm0, xmm0, 7
+        vpxor xmm1, xmm1, xmm3
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm7, xmm7, xmm1
+        vmovdqu OWORD PTR [esp+48], xmm7  ; [esp+48] = H^4
+        ; First 64 bytes of input
+        vmovdqu xmm0, OWORD PTR [esp+64]  ; xmm0 = current counter
+        vpaddd xmm7, xmm0, OWORD PTR L_aes_gcm_avx1_four
+        vmovdqu OWORD PTR [esp+64], xmm7  ; stored counter advances by the four constant
+        vmovdqa xmm7, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+        vpaddd xmm1, xmm0, OWORD PTR L_aes_gcm_avx1_one  ; xmm0..xmm3 = four consecutive counter blocks, each shuffled with xmm7
+        vpshufb xmm1, xmm1, xmm7
+        vpaddd xmm2, xmm0, OWORD PTR L_aes_gcm_avx1_two
+        vpshufb xmm2, xmm2, xmm7
+        vpaddd xmm3, xmm0, OWORD PTR L_aes_gcm_avx1_three
+        vpshufb xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp]  ; initial round-key XOR on all four blocks
+        vpxor xmm0, xmm0, xmm7
+        vpxor xmm1, xmm1, xmm7
+        vpxor xmm2, xmm2, xmm7
+        vpxor xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+16]  ; nine rounds, keys at ebp+16 through ebp+144, four blocks per key load
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+32]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+48]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+64]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+80]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+96]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+112]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+128]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+144]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        cmp DWORD PTR [esp+120], 11  ; 2nd stack argument: below 11 the key at ebp+160 is the last round key
+        vmovdqa xmm7, OWORD PTR [ebp+160]
+        jl L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+176]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        cmp DWORD PTR [esp+120], 13  ; below 13 the key at ebp+192 is the last round key
+        vmovdqa xmm7, OWORD PTR [ebp+192]
+        jl L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+208]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+224]  ; otherwise the key at ebp+224 is the last round key
+L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done:
+        vaesenclast xmm0, xmm0, xmm7
+        vaesenclast xmm1, xmm1, xmm7
+        vmovdqu xmm4, OWORD PTR [esi]  ; input is loaded into xmm4/xmm5, so xmm5 no longer holds H from here on
+        vmovdqu xmm5, OWORD PTR [esi+16]
+        vpxor xmm0, xmm0, xmm4
+        vpxor xmm1, xmm1, xmm5
+        vmovdqu OWORD PTR [edi], xmm0
+        vmovdqu OWORD PTR [edi+16], xmm1
+        vaesenclast xmm2, xmm2, xmm7
+        vaesenclast xmm3, xmm3, xmm7
+        vmovdqu xmm4, OWORD PTR [esi+32]
+        vmovdqu xmm5, OWORD PTR [esi+48]
+        vpxor xmm2, xmm2, xmm4
+        vpxor xmm3, xmm3, xmm5
+        vmovdqu OWORD PTR [edi+32], xmm2
+        vmovdqu OWORD PTR [edi+48], xmm3
+        cmp eax, 64
+        mov ebx, 64  ; mov does not change the flags set by the cmp above
+        mov ecx, esi
+        mov edx, edi  ; edx = output of the chunk just written, hashed at the end_64 label
+        jle L_AES_GCM_encrypt_update_avx1_end_64  ; only one 64-byte chunk: go straight to hashing it
+        ; More 64 bytes of input
+L_AES_GCM_encrypt_update_avx1_ghash_64:  ; each pass encrypts chunk ebx and hashes the previous chunk's output
+        lea ecx, DWORD PTR [esi+ebx]  ; ecx = input of this chunk
+        lea edx, DWORD PTR [edi+ebx]  ; edx = output of this chunk
+        vmovdqu xmm0, OWORD PTR [esp+64]
+        vpaddd xmm7, xmm0, OWORD PTR L_aes_gcm_avx1_four
+        vmovdqu OWORD PTR [esp+64], xmm7
+        vmovdqa xmm7, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+        vpaddd xmm1, xmm0, OWORD PTR L_aes_gcm_avx1_one
+        vpshufb xmm1, xmm1, xmm7
+        vpaddd xmm2, xmm0, OWORD PTR L_aes_gcm_avx1_two
+        vpshufb xmm2, xmm2, xmm7
+        vpaddd xmm3, xmm0, OWORD PTR L_aes_gcm_avx1_three
+        vpshufb xmm3, xmm3, xmm7
+        vpshufb xmm0, xmm0, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp]
+        vpxor xmm0, xmm0, xmm7
+        vpxor xmm1, xmm1, xmm7
+        vpxor xmm2, xmm2, xmm7
+        vpxor xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+16]  ; nine rounds, keys at ebp+16 through ebp+144
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+32]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+48]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+64]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+80]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+96]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+112]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+128]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+144]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        cmp DWORD PTR [esp+120], 11  ; same round-count selection as in the first-64-bytes path
+        vmovdqa xmm7, OWORD PTR [ebp+160]
+        jl L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+176]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        cmp DWORD PTR [esp+120], 13
+        vmovdqa xmm7, OWORD PTR [ebp+192]
+        jl L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+208]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqa xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
+        vaesenclast xmm0, xmm0, xmm7
+        vaesenclast xmm1, xmm1, xmm7
+        vmovdqu xmm4, OWORD PTR [ecx]
+        vmovdqu xmm5, OWORD PTR [ecx+16]
+        vpxor xmm0, xmm0, xmm4
+        vpxor xmm1, xmm1, xmm5
+        vmovdqu OWORD PTR [edx], xmm0
+        vmovdqu OWORD PTR [edx+16], xmm1
+        vaesenclast xmm2, xmm2, xmm7
+        vaesenclast xmm3, xmm3, xmm7
+        vmovdqu xmm4, OWORD PTR [ecx+32]
+        vmovdqu xmm5, OWORD PTR [ecx+48]
+        vpxor xmm2, xmm2, xmm4
+        vpxor xmm3, xmm3, xmm5
+        vmovdqu OWORD PTR [edx+32], xmm2
+        vmovdqu OWORD PTR [edx+48], xmm3
+        ; ghash encrypted counter
+        vmovdqu xmm2, OWORD PTR [esp+80]  ; xmm2 = running GHASH state
+        vmovdqu xmm7, OWORD PTR [esp+48]  ; H^4 pairs with the oldest of the four previous output blocks
+        vmovdqu xmm0, OWORD PTR [edx+-64]  ; previous chunk's output, 64 bytes behind the current output pointer
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+        vpxor xmm0, xmm0, xmm2  ; the state is folded into the first block only
+        vpshufd xmm1, xmm7, 78
+        vpshufd xmm5, xmm0, 78
+        vpxor xmm1, xmm1, xmm7
+        vpxor xmm5, xmm5, xmm0
+        vpclmulqdq xmm3, xmm0, xmm7, 17  ; xmm3 accumulates high products
+        vpclmulqdq xmm2, xmm0, xmm7, 0  ; xmm2 accumulates low products
+        vpclmulqdq xmm1, xmm1, xmm5, 0  ; xmm1 accumulates middle terms
+        vpxor xmm1, xmm1, xmm2
+        vpxor xmm1, xmm1, xmm3
+        vmovdqu xmm7, OWORD PTR [esp+32]  ; H^3 with the second block
+        vmovdqu xmm0, OWORD PTR [edx+-48]
+        vpshufd xmm4, xmm7, 78
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+        vpxor xmm4, xmm4, xmm7
+        vpshufd xmm5, xmm0, 78
+        vpxor xmm5, xmm5, xmm0
+        vpclmulqdq xmm6, xmm0, xmm7, 17
+        vpclmulqdq xmm7, xmm0, xmm7, 0
+        vpclmulqdq xmm4, xmm4, xmm5, 0
+        vpxor xmm1, xmm1, xmm7
+        vpxor xmm2, xmm2, xmm7
+        vpxor xmm1, xmm1, xmm6
+        vpxor xmm3, xmm3, xmm6
+        vpxor xmm1, xmm1, xmm4
+        vmovdqu xmm7, OWORD PTR [esp+16]  ; H^2 with the third block
+        vmovdqu xmm0, OWORD PTR [edx+-32]
+        vpshufd xmm4, xmm7, 78
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+        vpxor xmm4, xmm4, xmm7
+        vpshufd xmm5, xmm0, 78
+        vpxor xmm5, xmm5, xmm0
+        vpclmulqdq xmm6, xmm0, xmm7, 17
+        vpclmulqdq xmm7, xmm0, xmm7, 0
+        vpclmulqdq xmm4, xmm4, xmm5, 0
+        vpxor xmm1, xmm1, xmm7
+        vpxor xmm2, xmm2, xmm7
+        vpxor xmm1, xmm1, xmm6
+        vpxor xmm3, xmm3, xmm6
+        vpxor xmm1, xmm1, xmm4
+        vmovdqu xmm7, OWORD PTR [esp]  ; H^1 with the fourth block
+        vmovdqu xmm0, OWORD PTR [edx+-16]
+        vpshufd xmm4, xmm7, 78
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+        vpxor xmm4, xmm4, xmm7
+        vpshufd xmm5, xmm0, 78
+        vpxor xmm5, xmm5, xmm0
+        vpclmulqdq xmm6, xmm0, xmm7, 17
+        vpclmulqdq xmm7, xmm0, xmm7, 0
+        vpclmulqdq xmm4, xmm4, xmm5, 0
+        vpxor xmm1, xmm1, xmm7
+        vpxor xmm2, xmm2, xmm7
+        vpxor xmm1, xmm1, xmm6
+        vpxor xmm3, xmm3, xmm6
+        vpxor xmm1, xmm1, xmm4
+        vpslldq xmm5, xmm1, 8  ; split the middle sum across the low (xmm2) and high (xmm3) halves
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm2, xmm2, xmm5
+        vpxor xmm3, xmm3, xmm1
+        vpslld xmm7, xmm2, 31  ; one reduction for all four products
+        vpslld xmm4, xmm2, 30
+        vpslld xmm5, xmm2, 25
+        vpxor xmm7, xmm7, xmm4
+        vpxor xmm7, xmm7, xmm5
+        vpsrldq xmm4, xmm7, 4
+        vpslldq xmm7, xmm7, 12
+        vpxor xmm2, xmm2, xmm7
+        vpsrld xmm5, xmm2, 1
+        vpsrld xmm1, xmm2, 2
+        vpsrld xmm0, xmm2, 7
+        vpxor xmm5, xmm5, xmm1
+        vpxor xmm5, xmm5, xmm0
+        vpxor xmm5, xmm5, xmm4
+        vpxor xmm2, xmm2, xmm5
+        vpxor xmm2, xmm2, xmm3
+        vmovdqu OWORD PTR [esp+80], xmm2  ; save the updated GHASH state
+        add ebx, 64
+        cmp ebx, eax
+        jl L_AES_GCM_encrypt_update_avx1_ghash_64
+L_AES_GCM_encrypt_update_avx1_end_64:  ; hash the last 64 output bytes, which no loop pass has hashed yet
+        movdqu xmm6, OWORD PTR [esp+80]  ; NOTE(review): non-VEX encoding here and in the pshufb/pxor below, unlike the surrounding code - confirm intentional
+        ; Block 1
+        vmovdqa xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+        vmovdqu xmm5, OWORD PTR [edx]
+        pshufb xmm5, xmm0
+        vmovdqu xmm7, OWORD PTR [esp+48]  ; H^4
+        pxor xmm5, xmm6  ; fold the saved state into the first block
+        ; ghash_gfmul_avx
+        vpshufd xmm1, xmm5, 78
+        vpshufd xmm2, xmm7, 78
+        vpclmulqdq xmm3, xmm7, xmm5, 17
+        vpclmulqdq xmm0, xmm7, xmm5, 0
+        vpxor xmm1, xmm1, xmm5
+        vpxor xmm2, xmm2, xmm7
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vmovdqa xmm4, xmm0  ; xmm4 = low accumulator
+        vmovdqa xmm6, xmm3  ; xmm6 = high accumulator
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm4, xmm4, xmm2
+        vpxor xmm6, xmm6, xmm1
+        ; Block 2
+        vmovdqa xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+        vmovdqu xmm5, OWORD PTR [edx+16]
+        pshufb xmm5, xmm0
+        vmovdqu xmm7, OWORD PTR [esp+32]  ; H^3
+        ; ghash_gfmul_xor_avx
+        vpshufd xmm1, xmm5, 78
+        vpshufd xmm2, xmm7, 78
+        vpclmulqdq xmm3, xmm7, xmm5, 17
+        vpclmulqdq xmm0, xmm7, xmm5, 0
+        vpxor xmm1, xmm1, xmm5
+        vpxor xmm2, xmm2, xmm7
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vpxor xmm4, xmm4, xmm0
+        vpxor xmm6, xmm6, xmm3
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm4, xmm4, xmm2
+        vpxor xmm6, xmm6, xmm1
+        ; Block 3
+        vmovdqa xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+        vmovdqu xmm5, OWORD PTR [edx+32]
+        pshufb xmm5, xmm0
+        vmovdqu xmm7, OWORD PTR [esp+16]  ; H^2
+        ; ghash_gfmul_xor_avx
+        vpshufd xmm1, xmm5, 78
+        vpshufd xmm2, xmm7, 78
+        vpclmulqdq xmm3, xmm7, xmm5, 17
+        vpclmulqdq xmm0, xmm7, xmm5, 0
+        vpxor xmm1, xmm1, xmm5
+        vpxor xmm2, xmm2, xmm7
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vpxor xmm4, xmm4, xmm0
+        vpxor xmm6, xmm6, xmm3
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm4, xmm4, xmm2
+        vpxor xmm6, xmm6, xmm1
+        ; Block 4
+        vmovdqa xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+        vmovdqu xmm5, OWORD PTR [edx+48]
+        pshufb xmm5, xmm0
+        vmovdqu xmm7, OWORD PTR [esp]  ; H^1
+        ; ghash_gfmul_xor_avx
+        vpshufd xmm1, xmm5, 78
+        vpshufd xmm2, xmm7, 78
+        vpclmulqdq xmm3, xmm7, xmm5, 17
+        vpclmulqdq xmm0, xmm7, xmm5, 0
+        vpxor xmm1, xmm1, xmm5
+        vpxor xmm2, xmm2, xmm7
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vpxor xmm4, xmm4, xmm0
+        vpxor xmm6, xmm6, xmm3
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm4, xmm4, xmm2
+        vpxor xmm6, xmm6, xmm1
+        vpslld xmm0, xmm4, 31  ; reduce the accumulated low half xmm4 into xmm6
+        vpslld xmm1, xmm4, 30
+        vpslld xmm2, xmm4, 25
+        vpxor xmm0, xmm0, xmm1
+        vpxor xmm0, xmm0, xmm2
+        vmovdqa xmm1, xmm0
+        vpsrldq xmm1, xmm1, 4
+        vpslldq xmm0, xmm0, 12
+        vpxor xmm4, xmm4, xmm0
+        vpsrld xmm2, xmm4, 1
+        vpsrld xmm3, xmm4, 2
+        vpsrld xmm0, xmm4, 7
+        vpxor xmm2, xmm2, xmm3
+        vpxor xmm2, xmm2, xmm0
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm2, xmm2, xmm4
+        vpxor xmm6, xmm6, xmm2  ; xmm6 = GHASH state after all 64-byte chunks
+        vmovdqu xmm5, OWORD PTR [esp]  ; reload H^1 into xmm5 for the single-block code below
+L_AES_GCM_encrypt_update_avx1_done_64:
+        mov edx, DWORD PTR [esp+132]  ; edx = byte count argument
+        cmp ebx, edx
+        jge L_AES_GCM_encrypt_update_avx1_done_enc  ; everything consumed already
+        mov eax, DWORD PTR [esp+132]
+        and eax, 4294967280  ; 0FFFFFFF0h: eax = byte count rounded down to a multiple of 16
+        cmp ebx, eax
+        jge L_AES_GCM_encrypt_update_avx1_last_block_done  ; no whole 16-byte block left; this routine does not touch a shorter tail
+        lea ecx, DWORD PTR [esi+ebx]
+        lea edx, DWORD PTR [edi+ebx]
+        vmovdqu xmm1, OWORD PTR [esp+64]
+        vpshufb xmm0, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_epi64  ; xmm0 = counter block to encrypt
+        vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_one
+        vmovdqu OWORD PTR [esp+64], xmm1  ; counter advances by the one constant
+        vpxor xmm0, xmm0, [ebp]
+        vaesenc xmm0, xmm0, [ebp+16]
+        vaesenc xmm0, xmm0, [ebp+32]
+        vaesenc xmm0, xmm0, [ebp+48]
+        vaesenc xmm0, xmm0, [ebp+64]
+        vaesenc xmm0, xmm0, [ebp+80]
+        vaesenc xmm0, xmm0, [ebp+96]
+        vaesenc xmm0, xmm0, [ebp+112]
+        vaesenc xmm0, xmm0, [ebp+128]
+        vaesenc xmm0, xmm0, [ebp+144]
+        cmp DWORD PTR [esp+120], 11
+        vmovdqa xmm1, OWORD PTR [ebp+160]
+        jl L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last
+        vaesenc xmm0, xmm0, xmm1
+        vaesenc xmm0, xmm0, [ebp+176]
+        cmp DWORD PTR [esp+120], 13
+        vmovdqa xmm1, OWORD PTR [ebp+192]
+        jl L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last
+        vaesenc xmm0, xmm0, xmm1
+        vaesenc xmm0, xmm0, [ebp+208]
+        vmovdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last:
+        vaesenclast xmm0, xmm0, xmm1
+        vmovdqu xmm1, OWORD PTR [ecx]
+        vpxor xmm0, xmm0, xmm1
+        vmovdqu OWORD PTR [edx], xmm0
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+        vpxor xmm6, xmm6, xmm0  ; state ^= output block; its multiply by H happens in the next pass or at last_block_ghash
+        add ebx, 16
+        cmp ebx, eax
+        jge L_AES_GCM_encrypt_update_avx1_last_block_ghash
+L_AES_GCM_encrypt_update_avx1_last_block_start:  ; AES rounds for this block are interleaved with the pending multiply of xmm6 by xmm5
+        lea ecx, DWORD PTR [esi+ebx]
+        lea edx, DWORD PTR [edi+ebx]
+        vmovdqu xmm1, OWORD PTR [esp+64]
+        vmovdqu xmm3, xmm6  ; xmm3 = state awaiting its multiply
+        vpshufb xmm0, xmm1, OWORD PTR L_aes_gcm_avx1_bswap_epi64
+        vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_one
+        vmovdqu OWORD PTR [esp+64], xmm1
+        vpxor xmm0, xmm0, [ebp]
+        vpclmulqdq xmm4, xmm3, xmm5, 16  ; selectors 16 and 1: the two cross products
+        vaesenc xmm0, xmm0, [ebp+16]
+        vaesenc xmm0, xmm0, [ebp+32]
+        vpclmulqdq xmm7, xmm3, xmm5, 1
+        vaesenc xmm0, xmm0, [ebp+48]
+        vaesenc xmm0, xmm0, [ebp+64]
+        vaesenc xmm0, xmm0, [ebp+80]
+        vpclmulqdq xmm1, xmm3, xmm5, 17  ; high product
+        vaesenc xmm0, xmm0, [ebp+96]
+        vpxor xmm4, xmm4, xmm7
+        vpslldq xmm2, xmm4, 8
+        vpsrldq xmm4, xmm4, 8
+        vaesenc xmm0, xmm0, [ebp+112]
+        vpclmulqdq xmm7, xmm3, xmm5, 0  ; low product
+        vpxor xmm2, xmm2, xmm7
+        vpxor xmm1, xmm1, xmm4
+        vmovdqa xmm3, OWORD PTR L_aes_gcm_avx1_mod2_128  ; this reduction uses two carry-less multiplies by the mod2_128 constant
+        vpclmulqdq xmm7, xmm2, xmm3, 16
+        vaesenc xmm0, xmm0, [ebp+128]
+        vpshufd xmm4, xmm2, 78
+        vpxor xmm4, xmm4, xmm7
+        vpclmulqdq xmm7, xmm4, xmm3, 16
+        vaesenc xmm0, xmm0, [ebp+144]
+        vpshufd xmm6, xmm4, 78
+        vpxor xmm6, xmm6, xmm7
+        vpxor xmm6, xmm6, xmm1  ; xmm6 = reduced product, the new state
+        cmp DWORD PTR [esp+120], 11
+        vmovdqa xmm1, OWORD PTR [ebp+160]
+        jl L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
+        vaesenc xmm0, xmm0, xmm1
+        vaesenc xmm0, xmm0, [ebp+176]
+        cmp DWORD PTR [esp+120], 13
+        vmovdqa xmm1, OWORD PTR [ebp+192]
+        jl L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
+        vaesenc xmm0, xmm0, xmm1
+        vaesenc xmm0, xmm0, [ebp+208]
+        vmovdqa xmm1, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last:
+        vaesenclast xmm0, xmm0, xmm1
+        vmovdqu xmm1, OWORD PTR [ecx]
+        vpxor xmm0, xmm0, xmm1
+        vmovdqu OWORD PTR [edx], xmm0
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask
+        add ebx, 16
+        vpxor xmm6, xmm6, xmm0  ; vpxor leaves the flags alone, so the cmp below still tests the new ebx
+        cmp ebx, eax
+        jl L_AES_GCM_encrypt_update_avx1_last_block_start
+L_AES_GCM_encrypt_update_avx1_last_block_ghash:  ; final multiply for the block XORed in last
+        ; ghash_gfmul_red_avx
+        vpshufd xmm1, xmm5, 78
+        vpshufd xmm2, xmm6, 78
+        vpclmulqdq xmm3, xmm6, xmm5, 17
+        vpclmulqdq xmm0, xmm6, xmm5, 0
+        vpxor xmm1, xmm1, xmm5
+        vpxor xmm2, xmm2, xmm6
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm0, xmm0, xmm2
+        vpxor xmm6, xmm3, xmm1
+        vpslld xmm1, xmm0, 31
+        vpslld xmm2, xmm0, 30
+        vpslld xmm3, xmm0, 25
+        vpxor xmm1, xmm1, xmm2
+        vpxor xmm1, xmm1, xmm3
+        vpsrldq xmm3, xmm1, 4
+        vpslldq xmm1, xmm1, 12
+        vpxor xmm0, xmm0, xmm1
+        vpsrld xmm1, xmm0, 1
+        vpsrld xmm2, xmm0, 2
+        vpxor xmm1, xmm1,
xmm2
+        vpxor xmm1, xmm1, xmm0
+        vpsrld xmm0, xmm0, 7
+        vpxor xmm1, xmm1, xmm3
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm6, xmm6, xmm1  ; xmm6 = reduced GHASH state
+L_AES_GCM_encrypt_update_avx1_last_block_done:
+L_AES_GCM_encrypt_update_avx1_done_enc:
+        mov esi, DWORD PTR [esp+136]  ; reload the state pointer and the counter pointer arguments
+        mov edi, DWORD PTR [esp+144]
+        vmovdqu xmm4, OWORD PTR [esp+64]
+        vmovdqa OWORD PTR [esi], xmm6  ; write back the GHASH state (aligned store)
+        vmovdqu OWORD PTR [edi], xmm4  ; write back the advanced counter
+        add esp, 96
+        pop ebp
+        pop edi
+        pop esi
+        pop ebx
+        ret
+AES_GCM_encrypt_update_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_final_avx1 PROC  ; folds the two lengths into the GHASH state, XORs with a 16-byte block and writes the tag
+        push esi
+        push edi
+        push ebp
+        sub esp, 16  ; 16-byte scratch used only when the tag is copied byte by byte
+        mov ebp, DWORD PTR [esp+32]  ; 1st stack argument: pointer to the GHASH state
+        mov esi, DWORD PTR [esp+52]  ; 6th stack argument: pointer to H
+        mov edi, DWORD PTR [esp+56]  ; 7th stack argument: pointer to the block XORed into the result (presumably the encrypted initial counter)
+        vmovdqa xmm4, OWORD PTR [ebp]
+        vmovdqa xmm5, OWORD PTR [esi]
+        vmovdqa xmm6, OWORD PTR [edi]
+        vpsrlq xmm1, xmm5, 63  ; next 8 instructions: xmm5 = H shifted left one bit as a 128-bit value ...
+        vpsllq xmm0, xmm5, 1
+        vpslldq xmm1, xmm1, 8
+        vpor xmm0, xmm0, xmm1
+        vpshufd xmm5, xmm5, 255
+        vpsrad xmm5, xmm5, 31
+        vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_mod2_128
+        vpxor xmm5, xmm5, xmm0  ; ... XORed with the mod2_128 constant when the top bit of H was set
+        mov edx, DWORD PTR [esp+44]  ; 4th and 5th stack arguments: two byte counts (presumably data and AAD lengths)
+        mov ecx, DWORD PTR [esp+48]
+        shl edx, 3  ; bytes to bits, low 32 bits
+        shl ecx, 3
+        vpinsrd xmm0, xmm0, edx, 0  ; all four dwords of xmm0 get overwritten, so its old content does not matter
+        vpinsrd xmm0, xmm0, ecx, 2
+        mov edx, DWORD PTR [esp+44]
+        mov ecx, DWORD PTR [esp+48]
+        shr edx, 29  ; the three bits shifted out above become the high dword of each 64-bit bit count
+        shr ecx, 29
+        vpinsrd xmm0, xmm0, edx, 1
+        vpinsrd xmm0, xmm0, ecx, 3
+        vpxor xmm4, xmm4, xmm0  ; state ^= length block
+        ; ghash_gfmul_red_avx
+        vpshufd xmm1, xmm5, 78  ; 78 swaps the two 64-bit halves
+        vpshufd xmm2, xmm4, 78
+        vpclmulqdq xmm3, xmm4, xmm5, 17  ; high times high
+        vpclmulqdq xmm0, xmm4, xmm5, 0  ; low times low
+        vpxor xmm1, xmm1, xmm5
+        vpxor xmm2, xmm2, xmm4
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3  ; xmm1 = middle term
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm0, xmm0, xmm2
+        vpxor xmm4, xmm3, xmm1
+        vpslld xmm1, xmm0, 31  ; reduction: left shifts by 31, 30 and 25, then right shifts by 1, 2 and 7
+        vpslld xmm2, xmm0, 30
+        vpslld xmm3, xmm0, 25
+        vpxor xmm1, xmm1, xmm2
+        vpxor xmm1, xmm1, xmm3
+        vpsrldq xmm3, xmm1, 4
+        vpslldq xmm1, xmm1, 12
+        vpxor xmm0, xmm0, xmm1
+        vpsrld xmm1, xmm0, 1
+        vpsrld xmm2, xmm0, 2
+        vpxor xmm1, xmm1, xmm2
+        vpxor xmm1, xmm1, xmm0
+        vpsrld xmm0, xmm0, 7
+        vpxor xmm1, xmm1, xmm3
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm4, xmm4, xmm1
+        vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx1_bswap_mask
+        vpxor xmm0, xmm4, xmm6  ; xmm0 = tag value
+        mov edi, DWORD PTR [esp+36]  ; 2nd stack argument: tag output buffer
+        cmp DWORD PTR [esp+40], 16  ; 3rd stack argument: tag length in bytes
+        je L_AES_GCM_encrypt_final_avx1_store_tag_16
+        xor ecx, ecx
+        vmovdqu OWORD PTR [esp], xmm0  ; spill the tag so it can be copied byte by byte
+L_AES_GCM_encrypt_final_avx1_store_tag_loop:  ; bound is tested after the first byte is copied - NOTE(review): assumes a tag length of 1 to 15 here, confirm callers
+        movzx eax, BYTE PTR [esp+ecx]
+        mov BYTE PTR [edi+ecx], al
+        inc ecx
+        cmp ecx, DWORD PTR [esp+40]
+        jne L_AES_GCM_encrypt_final_avx1_store_tag_loop
+        jmp L_AES_GCM_encrypt_final_avx1_store_tag_done
+L_AES_GCM_encrypt_final_avx1_store_tag_16:
+        vmovdqu OWORD PTR [edi], xmm0  ; full 16-byte tag stored with one unaligned write
+L_AES_GCM_encrypt_final_avx1_store_tag_done:
+        add esp, 16
+        pop ebp
+        pop edi
+        pop esi
+        ret
+AES_GCM_encrypt_final_avx1 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_update_avx1 PROC
+        push ebx
+        push esi
+        push edi
+        push ebp
+        sub esp, 160  ; 160 bytes of locals; offsets 0 through 95 are used as in AES_GCM_encrypt_update_avx1
+        mov esi, DWORD PTR [esp+208]  ; 8th stack argument: pointer to the 16-byte counter
+        vmovdqa xmm4, OWORD PTR [esi]
+        vmovdqu OWORD PTR [esp+64], xmm4  ; working copy of the counter
+        mov esi, DWORD PTR [esp+200]  ; 6th stack argument: pointer to the GHASH state
+        mov ebp, DWORD PTR [esp+204]  ; 7th stack argument: pointer to H
+        vmovdqa xmm6, OWORD PTR [esi]
+        vmovdqa xmm5, OWORD PTR [ebp]
+        vmovdqu OWORD PTR [esp+80], xmm6  ; working copy of the GHASH state
+        mov ebp, DWORD PTR [esp+180]  ; 1st stack argument: round-key array
+        mov edi, DWORD PTR [esp+188]  ; 3rd stack argument: output buffer
+        mov esi, DWORD PTR [esp+192]  ; 4th stack argument: input buffer
+        vpsrlq xmm1, xmm5, 63  ; same one-bit left shift and conditional mod2_128 XOR of H as in the encrypt routine
+        vpsllq xmm0, xmm5, 1
+        vpslldq xmm1, xmm1, 8
+        vpor xmm0, xmm0, xmm1
+        vpshufd xmm5, xmm5, 255
+        vpsrad xmm5, xmm5, 31
+        vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_mod2_128
+        vpxor xmm5, xmm5, xmm0
+        xor ebx, ebx  ; ebx = bytes processed so far
+        cmp DWORD PTR [esp+196], 64  ; 5th stack argument: byte count
+        mov eax, DWORD PTR [esp+196]
+        jl L_AES_GCM_decrypt_update_avx1_done_64
+        and eax, 4294967232  ; 0FFFFFFC0h: eax = byte count rounded down to a multiple of 64
+        vmovdqa xmm2, xmm6
+        ; H ^ 1
+        vmovdqu OWORD PTR [esp], xmm5
+        ; H ^ 2
+        vpclmulqdq xmm0, xmm5, xmm5, 0
+        vpclmulqdq xmm4, xmm5, xmm5, 17
+        vpslld xmm1, xmm0, 31
+        vpslld xmm2, xmm0, 30
+        vpslld xmm3, xmm0, 25
+        vpxor xmm1, xmm1, xmm2
+        vpxor xmm1, xmm1, xmm3
+        vpsrldq xmm3, xmm1, 4
+        vpslldq xmm1, xmm1, 12
+        vpxor xmm0, xmm0, xmm1
+        vpsrld xmm1, xmm0, 1
+        vpsrld xmm2,
xmm0, 2 + vpxor xmm1, xmm1, xmm2 + vpxor xmm1, xmm1, xmm0 + vpsrld xmm0, xmm0, 7 + vpxor xmm1, xmm1, xmm3 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 + vmovdqu OWORD PTR [esp+16], xmm4 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm4, 78 + vpclmulqdq xmm3, xmm4, xmm5, 17 + vpclmulqdq xmm0, xmm4, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm4 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm0, xmm0, xmm2 + vpxor xmm7, xmm3, xmm1 + vpslld xmm1, xmm0, 31 + vpslld xmm2, xmm0, 30 + vpslld xmm3, xmm0, 25 + vpxor xmm1, xmm1, xmm2 + vpxor xmm1, xmm1, xmm3 + vpsrldq xmm3, xmm1, 4 + vpslldq xmm1, xmm1, 12 + vpxor xmm0, xmm0, xmm1 + vpsrld xmm1, xmm0, 1 + vpsrld xmm2, xmm0, 2 + vpxor xmm1, xmm1, xmm2 + vpxor xmm1, xmm1, xmm0 + vpsrld xmm0, xmm0, 7 + vpxor xmm1, xmm1, xmm3 + vpxor xmm1, xmm1, xmm0 + vpxor xmm7, xmm7, xmm1 + vmovdqu OWORD PTR [esp+32], xmm7 + ; H ^ 4 + vpclmulqdq xmm0, xmm4, xmm4, 0 + vpclmulqdq xmm7, xmm4, xmm4, 17 + vpslld xmm1, xmm0, 31 + vpslld xmm2, xmm0, 30 + vpslld xmm3, xmm0, 25 + vpxor xmm1, xmm1, xmm2 + vpxor xmm1, xmm1, xmm3 + vpsrldq xmm3, xmm1, 4 + vpslldq xmm1, xmm1, 12 + vpxor xmm0, xmm0, xmm1 + vpsrld xmm1, xmm0, 1 + vpsrld xmm2, xmm0, 2 + vpxor xmm1, xmm1, xmm2 + vpxor xmm1, xmm1, xmm0 + vpsrld xmm0, xmm0, 7 + vpxor xmm1, xmm1, xmm3 + vpxor xmm1, xmm1, xmm0 + vpxor xmm7, xmm7, xmm1 + vmovdqu OWORD PTR [esp+48], xmm7 + cmp edi, esi + jne L_AES_GCM_decrypt_update_avx1_ghash_64 +L_AES_GCM_decrypt_update_avx1_ghash_64_inplace: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + vmovdqu xmm0, OWORD PTR [esp+64] + vpaddd xmm7, xmm0, OWORD PTR L_aes_gcm_avx1_four + vmovdqu OWORD PTR [esp+64], xmm7 + vmovdqa xmm7, OWORD PTR L_aes_gcm_avx1_bswap_epi64 + vpaddd xmm1, xmm0, OWORD PTR L_aes_gcm_avx1_one + vpshufb xmm1, xmm1, xmm7 + vpaddd xmm2, xmm0, OWORD PTR L_aes_gcm_avx1_two + vpshufb xmm2, xmm2, xmm7 
+ vpaddd xmm3, xmm0, OWORD PTR L_aes_gcm_avx1_three + vpshufb xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm7 + vmovdqa xmm7, OWORD PTR [ebp] + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+16] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+32] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+48] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+64] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+80] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+96] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+112] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+128] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+144] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+184], 11 + vmovdqa xmm7, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+176] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+184], 13 + vmovdqa xmm7, OWORD PTR [ebp+192] + jl 
L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+208] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done: + vaesenclast xmm0, xmm0, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vmovdqu xmm4, OWORD PTR [ecx] + vmovdqu xmm5, OWORD PTR [ecx+16] + vpxor xmm0, xmm0, xmm4 + vpxor xmm1, xmm1, xmm5 + vmovdqu OWORD PTR [esp+96], xmm4 + vmovdqu OWORD PTR [esp+112], xmm5 + vmovdqu OWORD PTR [edx], xmm0 + vmovdqu OWORD PTR [edx+16], xmm1 + vaesenclast xmm2, xmm2, xmm7 + vaesenclast xmm3, xmm3, xmm7 + vmovdqu xmm4, OWORD PTR [ecx+32] + vmovdqu xmm5, OWORD PTR [ecx+48] + vpxor xmm2, xmm2, xmm4 + vpxor xmm3, xmm3, xmm5 + vmovdqu OWORD PTR [esp+128], xmm4 + vmovdqu OWORD PTR [esp+144], xmm5 + vmovdqu OWORD PTR [edx+32], xmm2 + vmovdqu OWORD PTR [edx+48], xmm3 + ; ghash encrypted counter + vmovdqu xmm2, OWORD PTR [esp+80] + vmovdqu xmm7, OWORD PTR [esp+48] + vmovdqu xmm0, OWORD PTR [esp+96] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm2 + vpshufd xmm1, xmm7, 78 + vpshufd xmm5, xmm0, 78 + vpxor xmm1, xmm1, xmm7 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm3, xmm0, xmm7, 17 + vpclmulqdq xmm2, xmm0, xmm7, 0 + vpclmulqdq xmm1, xmm1, xmm5, 0 + vpxor xmm1, xmm1, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmm7, OWORD PTR [esp+32] + vmovdqu xmm0, OWORD PTR [esp+112] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vpclmulqdq xmm7, xmm0, xmm7, 0 + vpclmulqdq xmm4, xmm4, xmm5, 0 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, 
xmm4 + vmovdqu xmm7, OWORD PTR [esp+16] + vmovdqu xmm0, OWORD PTR [esp+128] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vpclmulqdq xmm7, xmm0, xmm7, 0 + vpclmulqdq xmm4, xmm4, xmm5, 0 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [esp] + vmovdqu xmm0, OWORD PTR [esp+144] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vpclmulqdq xmm7, xmm0, xmm7, 0 + vpclmulqdq xmm4, xmm4, xmm5, 0 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vpslldq xmm5, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm2, xmm2, xmm5 + vpxor xmm3, xmm3, xmm1 + vpslld xmm7, xmm2, 31 + vpslld xmm4, xmm2, 30 + vpslld xmm5, xmm2, 25 + vpxor xmm7, xmm7, xmm4 + vpxor xmm7, xmm7, xmm5 + vpsrldq xmm4, xmm7, 4 + vpslldq xmm7, xmm7, 12 + vpxor xmm2, xmm2, xmm7 + vpsrld xmm5, xmm2, 1 + vpsrld xmm1, xmm2, 2 + vpsrld xmm0, xmm2, 7 + vpxor xmm5, xmm5, xmm1 + vpxor xmm5, xmm5, xmm0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm2, xmm5 + vpxor xmm2, xmm2, xmm3 + vmovdqu OWORD PTR [esp+80], xmm2 + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_decrypt_update_avx1_ghash_64_inplace + jmp L_AES_GCM_decrypt_update_avx1_ghash_64_done +L_AES_GCM_decrypt_update_avx1_ghash_64: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + vmovdqu xmm0, OWORD PTR [esp+64] + vpaddd xmm7, xmm0, OWORD PTR L_aes_gcm_avx1_four + vmovdqu OWORD PTR [esp+64], xmm7 + vmovdqa xmm7, OWORD PTR L_aes_gcm_avx1_bswap_epi64 + vpaddd xmm1, xmm0, OWORD PTR L_aes_gcm_avx1_one + vpshufb xmm1, xmm1, xmm7 + vpaddd xmm2, xmm0, OWORD PTR L_aes_gcm_avx1_two + vpshufb xmm2, xmm2, xmm7 + 
vpaddd xmm3, xmm0, OWORD PTR L_aes_gcm_avx1_three + vpshufb xmm3, xmm3, xmm7 + vpshufb xmm0, xmm0, xmm7 + vmovdqa xmm7, OWORD PTR [ebp] + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+16] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+32] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+48] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+64] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+80] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+96] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+112] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+128] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+144] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+184], 11 + vmovdqa xmm7, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+176] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+184], 13 + vmovdqa xmm7, OWORD PTR [ebp+192] + jl 
L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+208] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqa xmm7, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done: + vaesenclast xmm0, xmm0, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vmovdqu xmm4, OWORD PTR [ecx] + vmovdqu xmm5, OWORD PTR [ecx+16] + vpxor xmm0, xmm0, xmm4 + vpxor xmm1, xmm1, xmm5 + vmovdqu OWORD PTR [edx], xmm0 + vmovdqu OWORD PTR [edx+16], xmm1 + vaesenclast xmm2, xmm2, xmm7 + vaesenclast xmm3, xmm3, xmm7 + vmovdqu xmm4, OWORD PTR [ecx+32] + vmovdqu xmm5, OWORD PTR [ecx+48] + vpxor xmm2, xmm2, xmm4 + vpxor xmm3, xmm3, xmm5 + vmovdqu OWORD PTR [edx+32], xmm2 + vmovdqu OWORD PTR [edx+48], xmm3 + ; ghash encrypted counter + vmovdqu xmm2, OWORD PTR [esp+80] + vmovdqu xmm7, OWORD PTR [esp+48] + vmovdqu xmm0, OWORD PTR [ecx] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm0, xmm0, xmm2 + vpshufd xmm1, xmm7, 78 + vpshufd xmm5, xmm0, 78 + vpxor xmm1, xmm1, xmm7 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm3, xmm0, xmm7, 17 + vpclmulqdq xmm2, xmm0, xmm7, 0 + vpclmulqdq xmm1, xmm1, xmm5, 0 + vpxor xmm1, xmm1, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmm7, OWORD PTR [esp+32] + vmovdqu xmm0, OWORD PTR [ecx+16] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vpclmulqdq xmm7, xmm0, xmm7, 0 + vpclmulqdq xmm4, xmm4, xmm5, 0 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [esp+16] + vmovdqu xmm0, OWORD PTR [ecx+32] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + 
vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vpclmulqdq xmm7, xmm0, xmm7, 0 + vpclmulqdq xmm4, xmm4, xmm5, 0 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [esp] + vmovdqu xmm0, OWORD PTR [ecx+48] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vpclmulqdq xmm7, xmm0, xmm7, 0 + vpclmulqdq xmm4, xmm4, xmm5, 0 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vpslldq xmm5, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm2, xmm2, xmm5 + vpxor xmm3, xmm3, xmm1 + vpslld xmm7, xmm2, 31 + vpslld xmm4, xmm2, 30 + vpslld xmm5, xmm2, 25 + vpxor xmm7, xmm7, xmm4 + vpxor xmm7, xmm7, xmm5 + vpsrldq xmm4, xmm7, 4 + vpslldq xmm7, xmm7, 12 + vpxor xmm2, xmm2, xmm7 + vpsrld xmm5, xmm2, 1 + vpsrld xmm1, xmm2, 2 + vpsrld xmm0, xmm2, 7 + vpxor xmm5, xmm5, xmm1 + vpxor xmm5, xmm5, xmm0 + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm2, xmm5 + vpxor xmm2, xmm2, xmm3 + vmovdqu OWORD PTR [esp+80], xmm2 + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_decrypt_update_avx1_ghash_64 +L_AES_GCM_decrypt_update_avx1_ghash_64_done: + vmovdqa xmm6, xmm2 + vmovdqu xmm5, OWORD PTR [esp] +L_AES_GCM_decrypt_update_avx1_done_64: + mov edx, DWORD PTR [esp+196] + cmp ebx, edx + jge L_AES_GCM_decrypt_update_avx1_done_dec + mov eax, DWORD PTR [esp+196] + and eax, 4294967280 + cmp ebx, eax + jge L_AES_GCM_decrypt_update_avx1_last_block_done +L_AES_GCM_decrypt_update_avx1_last_block_start: + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + vmovdqu xmm3, OWORD PTR [ecx] + vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx1_bswap_mask + vpxor xmm3, xmm3, xmm6 + vmovdqu xmm1, OWORD PTR [esp+64] + vpshufb xmm0, xmm1, OWORD PTR 
L_aes_gcm_avx1_bswap_epi64 + vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx1_one + vmovdqu OWORD PTR [esp+64], xmm1 + vpxor xmm0, xmm0, [ebp] + vpclmulqdq xmm4, xmm3, xmm5, 16 + vaesenc xmm0, xmm0, [ebp+16] + vaesenc xmm0, xmm0, [ebp+32] + vpclmulqdq xmm7, xmm3, xmm5, 1 + vaesenc xmm0, xmm0, [ebp+48] + vaesenc xmm0, xmm0, [ebp+64] + vaesenc xmm0, xmm0, [ebp+80] + vpclmulqdq xmm1, xmm3, xmm5, 17 + vaesenc xmm0, xmm0, [ebp+96] + vpxor xmm4, xmm4, xmm7 + vpslldq xmm2, xmm4, 8 + vpsrldq xmm4, xmm4, 8 + vaesenc xmm0, xmm0, [ebp+112] + vpclmulqdq xmm7, xmm3, xmm5, 0 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm4 + vmovdqa xmm3, OWORD PTR L_aes_gcm_avx1_mod2_128 + vpclmulqdq xmm7, xmm2, xmm3, 16 + vaesenc xmm0, xmm0, [ebp+128] + vpshufd xmm4, xmm2, 78 + vpxor xmm4, xmm4, xmm7 + vpclmulqdq xmm7, xmm4, xmm3, 16 + vaesenc xmm0, xmm0, [ebp+144] + vpshufd xmm6, xmm4, 78 + vpxor xmm6, xmm6, xmm7 + vpxor xmm6, xmm6, xmm1 + cmp DWORD PTR [esp+184], 11 + vmovdqa xmm1, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last + vaesenc xmm0, xmm0, xmm1 + vaesenc xmm0, xmm0, [ebp+176] + cmp DWORD PTR [esp+184], 13 + vmovdqa xmm1, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last + vaesenc xmm0, xmm0, xmm1 + vaesenc xmm0, xmm0, [ebp+208] + vmovdqa xmm1, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last: + vaesenclast xmm0, xmm0, xmm1 + vmovdqu xmm1, OWORD PTR [ecx] + vpxor xmm0, xmm0, xmm1 + vmovdqu OWORD PTR [edx], xmm0 + add ebx, 16 + cmp ebx, eax + jl L_AES_GCM_decrypt_update_avx1_last_block_start +L_AES_GCM_decrypt_update_avx1_last_block_done: +L_AES_GCM_decrypt_update_avx1_done_dec: + mov esi, DWORD PTR [esp+200] + mov edi, DWORD PTR [esp+208] + vmovdqu xmm4, OWORD PTR [esp+64] + vmovdqa OWORD PTR [esi], xmm6 + vmovdqu OWORD PTR [edi], xmm4 + add esp, 160 + pop ebp + pop edi + pop esi + pop ebx + ret +AES_GCM_decrypt_update_avx1 ENDP +_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; AES_GCM_decrypt_final_avx1
+;
+; Folds the length block into the running GHASH value, multiplies by the
+; hash key, XORs in the 16-byte block passed as the seventh argument and
+; compares the resulting tag with the caller-supplied tag.
+;
+; Arguments are read from the stack. Offsets below are relative to esp after
+; the prologue (four pushes plus 16 bytes of locals):
+;   [esp+36]  pointer to the 16-byte running GHASH value (aligned load)
+;   [esp+40]  pointer to the tag to check against
+;   [esp+44]  number of tag bytes to compare
+;   [esp+48]  32-bit byte count; its bit count fills the low 64 bits of the
+;             length block (presumably the ciphertext length - confirm
+;             against the C prototype)
+;   [esp+52]  32-bit byte count; its bit count fills the high 64 bits of the
+;             length block (presumably the AAD length - confirm)
+;   [esp+56]  pointer to the 16-byte hash key H (aligned load)
+;   [esp+60]  pointer to a 16-byte block XORed into the final tag
+;             (presumably the encrypted initial counter - confirm)
+;   [esp+64]  pointer to a DWORD result: 1 when the tags match, else 0
+;
+; ebx, esi, edi and ebp are saved and restored. eax, ecx, edx, xmm0-xmm3 and
+; xmm5-xmm7 are overwritten.
+AES_GCM_decrypt_final_avx1 PROC
+        push ebx
+        push esi
+        push edi
+        push ebp
+        sub esp, 16
+        mov ebp, DWORD PTR [esp+36]
+        mov esi, DWORD PTR [esp+56]
+        mov edi, DWORD PTR [esp+60]
+        vmovdqa xmm6, OWORD PTR [ebp]
+        vmovdqa xmm5, OWORD PTR [esi]
+        vmovdqa xmm7, OWORD PTR [edi]
+        ; xmm5 = H shifted left by one bit across all 128 bits, XORed with
+        ; L_aes_gcm_avx1_mod2_128 when the bit shifted out of the top was set.
+        vpsrlq xmm1, xmm5, 63
+        vpsllq xmm0, xmm5, 1
+        vpslldq xmm1, xmm1, 8
+        vpor xmm0, xmm0, xmm1
+        vpshufd xmm5, xmm5, 255
+        vpsrad xmm5, xmm5, 31
+        vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx1_mod2_128
+        vpxor xmm5, xmm5, xmm0
+        ; Build the length block in xmm0: each byte count is converted to a
+        ; 64-bit bit count (low dword = count << 3, high dword = count >> 29).
+        ; All four dwords of xmm0 are replaced.
+        mov edx, DWORD PTR [esp+48]
+        mov ecx, DWORD PTR [esp+52]
+        shl edx, 3
+        shl ecx, 3
+        vpinsrd xmm0, xmm0, edx, 0
+        vpinsrd xmm0, xmm0, ecx, 2
+        mov edx, DWORD PTR [esp+48]
+        mov ecx, DWORD PTR [esp+52]
+        shr edx, 29
+        shr ecx, 29
+        vpinsrd xmm0, xmm0, edx, 1
+        vpinsrd xmm0, xmm0, ecx, 3
+        vpxor xmm6, xmm6, xmm0
+        ; ghash_gfmul_red_avx
+        ; Three carry-less multiplies (Karatsuba) of xmm6 by xmm5, followed by
+        ; the shift/XOR reduction of the 256-bit product back into xmm6.
+        vpshufd xmm1, xmm5, 78
+        vpshufd xmm2, xmm6, 78
+        vpclmulqdq xmm3, xmm6, xmm5, 17
+        vpclmulqdq xmm0, xmm6, xmm5, 0
+        vpxor xmm1, xmm1, xmm5
+        vpxor xmm2, xmm2, xmm6
+        vpclmulqdq xmm1, xmm1, xmm2, 0
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm1, xmm1, xmm3
+        vpslldq xmm2, xmm1, 8
+        vpsrldq xmm1, xmm1, 8
+        vpxor xmm0, xmm0, xmm2
+        vpxor xmm6, xmm3, xmm1
+        vpslld xmm1, xmm0, 31
+        vpslld xmm2, xmm0, 30
+        vpslld xmm3, xmm0, 25
+        vpxor xmm1, xmm1, xmm2
+        vpxor xmm1, xmm1, xmm3
+        vpsrldq xmm3, xmm1, 4
+        vpslldq xmm1, xmm1, 12
+        vpxor xmm0, xmm0, xmm1
+        vpsrld xmm1, xmm0, 1
+        vpsrld xmm2, xmm0, 2
+        vpxor xmm1, xmm1, xmm2
+        vpxor xmm1, xmm1, xmm0
+        vpsrld xmm0, xmm0, 7
+        vpxor xmm1, xmm1, xmm3
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm6, xmm6, xmm1
+        ; xmm0 = computed tag: shuffled GHASH result XOR the block from [esp+60].
+        vpshufb xmm6, xmm6, OWORD PTR L_aes_gcm_avx1_bswap_mask
+        vpxor xmm0, xmm6, xmm7
+        mov esi, DWORD PTR [esp+40]
+        mov edi, DWORD PTR [esp+64]
+        cmp DWORD PTR [esp+44], 16
+        je L_AES_GCM_decrypt_final_avx1_cmp_tag_16
+        ; Partial tag: spill the computed tag to 16 bytes of fresh scratch and
+        ; compare byte by byte. Differences are OR-ed into bl, so the loop
+        ; always runs for the full tag length with no early exit.
+        ; NOTE(review): the jne-terminated loop needs 1 <= tag length, and the
+        ; scratch holds 16 bytes - assumed to be validated by the caller.
+        sub esp, 16
+        xor ecx, ecx
+        xor ebx, ebx
+        vmovdqu OWORD PTR [esp], xmm0
+L_AES_GCM_decrypt_final_avx1_cmp_tag_loop:
+        movzx eax, BYTE PTR [esp+ecx]
+        xor al, BYTE PTR [esi+ecx]
+        or bl, al
+        inc ecx
+        ; esp is 16 lower than at the length check above, so the tag length
+        ; is at [esp+60] here; [esp+44] would read the saved ebx slot.
+        cmp ecx, DWORD PTR [esp+60]
+        jne L_AES_GCM_decrypt_final_avx1_cmp_tag_loop
+        cmp bl, 0
+        sete bl
+        add esp, 16
+        xor ecx, ecx
+        jmp L_AES_GCM_decrypt_final_avx1_cmp_tag_done
+L_AES_GCM_decrypt_final_avx1_cmp_tag_16:
+        ; Full 16-byte tag: compare all bytes at once.
+        vmovdqu xmm1, OWORD PTR [esi]
+        vpcmpeqb xmm0, xmm0, xmm1
+        vpmovmskb edx, xmm0
+        ; edx == 0xFFFF (all 16 bytes equal) => result 1, otherwise result 0
+        xor ebx, ebx
+        cmp edx, 65535
+        sete bl
+L_AES_GCM_decrypt_final_avx1_cmp_tag_done:
+        ; Store the 0/1 result through the pointer from [esp+64].
+        mov DWORD PTR [edi], ebx
+        add esp, 16
+        pop ebp
+        pop edi
+        pop esi
+        pop ebx
+        ret
+AES_GCM_decrypt_final_avx1 ENDP
+_TEXT ENDS
+ENDIF +ENDIF +IFDEF HAVE_INTEL_AVX2 +_TEXT SEGMENT READONLY PARA +AES_GCM_encrypt_avx2 PROC + push ebx + push esi + push edi + push ebp + sub esp, 112 + mov esi, DWORD PTR [esp+144] + mov ebp, DWORD PTR [esp+168] + mov edx, DWORD PTR [esp+160] + vpxor xmm4, xmm4, xmm4 + cmp edx, 12 + je L_AES_GCM_encrypt_avx2_iv_12 + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqu xmm5, OWORD PTR [ebp] + vaesenc xmm5, xmm5, [ebp+16] + vaesenc xmm5, xmm5, [ebp+32] + vaesenc xmm5, xmm5, [ebp+48] + vaesenc xmm5, xmm5, [ebp+64] + vaesenc xmm5, xmm5, [ebp+80] + vaesenc xmm5, xmm5, [ebp+96] + vaesenc xmm5, xmm5, [ebp+112] + vaesenc xmm5, xmm5, [ebp+128] + vaesenc xmm5, xmm5, [ebp+144] + cmp DWORD PTR [esp+172], 11 + vmovdqu xmm0, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm5, xmm5, [ebp+176] + cmp DWORD PTR [esp+172], 13 + vmovdqu xmm0, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm5, xmm5, [ebp+208] + vmovdqu xmm0, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm0 + vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov ecx, 0 + je L_AES_GCM_encrypt_avx2_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_encrypt_avx2_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_avx2_calc_iv_16_loop: +
vmovdqu xmm0, OWORD PTR [esi+ecx] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_avx2_calc_iv_16_loop + mov edx, DWORD PTR [esp+160] + cmp ecx, edx + je L_AES_GCM_encrypt_avx2_calc_iv_done +L_AES_GCM_encrypt_avx2_calc_iv_lt16: + vpxor xmm0, xmm0, xmm0 + xor ebx, ebx + vmovdqu OWORD PTR [esp], xmm0 +L_AES_GCM_encrypt_avx2_calc_iv_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_avx2_calc_iv_loop + vmovdqu xmm0, OWORD PTR [esp] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + ; 
ghash_red + vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 +L_AES_GCM_encrypt_avx2_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vpinsrd xmm0, xmm0, edx, 0 + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_mask + ; Encrypt counter + vmovdqu xmm6, OWORD PTR [ebp] + vpxor xmm6, xmm6, xmm4 + vaesenc xmm6, xmm6, [ebp+16] + vaesenc xmm6, xmm6, [ebp+32] + vaesenc xmm6, xmm6, [ebp+48] + vaesenc xmm6, xmm6, [ebp+64] + vaesenc xmm6, xmm6, [ebp+80] + vaesenc xmm6, xmm6, [ebp+96] + vaesenc xmm6, xmm6, [ebp+112] + vaesenc xmm6, xmm6, [ebp+128] + vaesenc xmm6, xmm6, [ebp+144] + cmp DWORD PTR [esp+172], 11 + vmovdqu xmm0, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last + vaesenc xmm6, xmm6, xmm0 + vaesenc xmm6, xmm6, [ebp+176] + cmp DWORD PTR [esp+172], 13 + vmovdqu xmm0, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last + vaesenc xmm6, xmm6, xmm0 + vaesenc xmm6, xmm6, [ebp+208] + vmovdqu xmm0, OWORD PTR 
[ebp+224] +L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last: + vaesenclast xmm6, xmm6, xmm0 + jmp L_AES_GCM_encrypt_avx2_iv_done +L_AES_GCM_encrypt_avx2_iv_12: + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + vmovdqu xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one + vmovdqu xmm5, OWORD PTR [ebp] + vpblendd xmm4, xmm4, [esi], 7 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqu xmm7, OWORD PTR [ebp+16] + vpxor xmm6, xmm4, xmm5 + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm6, xmm6, xmm7 + vmovdqu xmm0, OWORD PTR [ebp+32] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+48] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+64] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+80] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+96] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+112] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+128] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+144] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + cmp DWORD PTR [esp+172], 11 + vmovdqu xmm0, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx2_calc_iv_12_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+176] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + cmp DWORD PTR [esp+172], 13 + vmovdqu xmm0, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx2_calc_iv_12_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+208] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx2_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm0 + vaesenclast xmm6, xmm6, xmm0 + vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_bswap_mask +L_AES_GCM_encrypt_avx2_iv_done: + vmovdqu OWORD PTR [esp+80], xmm6 + 
vpxor xmm6, xmm6, xmm6 + mov esi, DWORD PTR [esp+140] + ; Additional authentication data + mov edx, DWORD PTR [esp+156] + cmp edx, 0 + je L_AES_GCM_encrypt_avx2_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_encrypt_avx2_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_avx2_calc_aad_16_loop: + vmovdqu xmm0, OWORD PTR [esi+ecx] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm6, 16 + vpclmulqdq xmm1, xmm5, xmm6, 1 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm6, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_avx2_calc_aad_16_loop + mov edx, DWORD PTR [esp+156] + cmp ecx, edx + je L_AES_GCM_encrypt_avx2_calc_aad_done +L_AES_GCM_encrypt_avx2_calc_aad_lt16: + vpxor xmm0, xmm0, xmm0 + xor ebx, ebx + vmovdqu OWORD PTR [esp], xmm0 +L_AES_GCM_encrypt_avx2_calc_aad_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_avx2_calc_aad_loop + vmovdqu xmm0, OWORD PTR [esp] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm6, 16 + vpclmulqdq xmm1, xmm5, xmm6, 1 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, 
xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm6, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 +L_AES_GCM_encrypt_avx2_calc_aad_done: + mov esi, DWORD PTR [esp+132] + mov edi, DWORD PTR [esp+136] + ; Calculate counter and H + vpsrlq xmm1, xmm5, 63 + vpsllq xmm0, xmm5, 1 + vpslldq xmm1, xmm1, 8 + vpor xmm0, xmm0, xmm1 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one + vpxor xmm5, xmm5, xmm0 + xor ebx, ebx + cmp DWORD PTR [esp+152], 64 + mov eax, DWORD PTR [esp+152] + jl L_AES_GCM_encrypt_avx2_done_64 + and eax, 4294967232 + vmovdqu OWORD PTR [esp+64], xmm4 + vmovdqu OWORD PTR [esp+96], xmm6 + vmovdqu xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128 + ; H ^ 1 + vmovdqu OWORD PTR [esp], xmm5 + vmovdqu xmm2, xmm5 + ; H ^ 2 + vpclmulqdq xmm5, xmm2, xmm2, 0 + vpclmulqdq xmm6, xmm2, xmm2, 17 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm5, xmm5, xmm4 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm5, xmm5, xmm4 + vpxor xmm0, xmm6, xmm5 + vmovdqu OWORD PTR [esp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red + vpclmulqdq xmm6, xmm2, xmm0, 16 + vpclmulqdq xmm5, xmm2, xmm0, 1 + vpclmulqdq xmm4, xmm2, xmm0, 0 + vpxor xmm6, xmm6, xmm5 + vpslldq xmm5, xmm6, 8 + vpsrldq xmm6, xmm6, 8 + vpxor xmm5, xmm5, xmm4 + vpclmulqdq xmm1, xmm2, xmm0, 17 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm5, xmm5, xmm4 + 
vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm1, xmm1, xmm6 + vpxor xmm1, xmm1, xmm5 + vpxor xmm1, xmm1, xmm4 + vmovdqu OWORD PTR [esp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm5, xmm0, xmm0, 0 + vpclmulqdq xmm6, xmm0, xmm0, 17 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm5, xmm5, xmm4 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm6, xmm5 + vmovdqu OWORD PTR [esp+48], xmm2 + vmovdqu xmm6, OWORD PTR [esp+96] + ; First 64 bytes of input + ; aesenc_64 + ; aesenc_ctr + vmovdqu xmm4, OWORD PTR [esp+64] + vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one + vpshufb xmm0, xmm4, xmm7 + vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two + vpshufb xmm1, xmm1, xmm7 + vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three + vpshufb xmm2, xmm2, xmm7 + vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four + vpshufb xmm3, xmm3, xmm7 + ; aesenc_xor + vmovdqu xmm7, OWORD PTR [ebp] + vmovdqu OWORD PTR [esp+64], xmm4 + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+16] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+32] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+48] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+64] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+80] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+96] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, 
xmm7 + vmovdqu xmm7, OWORD PTR [ebp+112] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+128] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+144] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+172], 11 + vmovdqu xmm7, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx2_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+176] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+172], 13 + vmovdqu xmm7, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx2_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+208] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx2_aesenc_64_enc_done: + ; aesenc_last + vaesenclast xmm0, xmm0, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vaesenclast xmm2, xmm2, xmm7 + vaesenclast xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [esi] + vmovdqu xmm4, OWORD PTR [esi+16] + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, xmm1, xmm4 + vmovdqu OWORD PTR [edi], xmm0 + vmovdqu OWORD PTR [edi+16], xmm1 + vmovdqu xmm7, OWORD PTR [esi+32] + vmovdqu xmm4, OWORD PTR [esi+48] + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm4 + vmovdqu OWORD PTR [edi+32], xmm2 + vmovdqu OWORD PTR [edi+48], xmm3 + cmp eax, 64 + mov ebx, 64 + mov ecx, esi + mov edx, edi + jle L_AES_GCM_encrypt_avx2_end_64 + ; More 64 bytes of input +L_AES_GCM_encrypt_avx2_ghash_64: + ; aesenc_64_ghash + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + 
; aesenc_64 + ; aesenc_ctr + vmovdqu xmm4, OWORD PTR [esp+64] + vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one + vpshufb xmm0, xmm4, xmm7 + vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two + vpshufb xmm1, xmm1, xmm7 + vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three + vpshufb xmm2, xmm2, xmm7 + vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four + vpshufb xmm3, xmm3, xmm7 + ; aesenc_xor + vmovdqu xmm7, OWORD PTR [ebp] + vmovdqu OWORD PTR [esp+64], xmm4 + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+16] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+32] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+48] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+64] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+80] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+96] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+112] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+128] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+144] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+172], 11 + vmovdqu xmm7, OWORD PTR [ebp+160] + jl 
L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+176] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+172], 13 + vmovdqu xmm7, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+208] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done: + ; aesenc_last + vaesenclast xmm0, xmm0, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vaesenclast xmm2, xmm2, xmm7 + vaesenclast xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ecx] + vmovdqu xmm4, OWORD PTR [ecx+16] + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, xmm1, xmm4 + vmovdqu OWORD PTR [edx], xmm0 + vmovdqu OWORD PTR [edx+16], xmm1 + vmovdqu xmm7, OWORD PTR [ecx+32] + vmovdqu xmm4, OWORD PTR [ecx+48] + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm4 + vmovdqu OWORD PTR [edx+32], xmm2 + vmovdqu OWORD PTR [edx+48], xmm3 + ; pclmul_1 + vmovdqu xmm1, OWORD PTR [edx+-64] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vmovdqu xmm2, OWORD PTR [esp+48] + vpxor xmm1, xmm1, xmm6 + vpclmulqdq xmm5, xmm1, xmm2, 16 + vpclmulqdq xmm3, xmm1, xmm2, 1 + vpclmulqdq xmm6, xmm1, xmm2, 0 + vpclmulqdq xmm7, xmm1, xmm2, 17 + ; pclmul_2 + vmovdqu xmm1, OWORD PTR [edx+-48] + vmovdqu xmm0, OWORD PTR [esp+32] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; pclmul_n + vmovdqu xmm1, OWORD PTR [edx+-32] + vmovdqu xmm0, OWORD 
PTR [esp+16] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; pclmul_n + vmovdqu xmm1, OWORD PTR [edx+-16] + vmovdqu xmm0, OWORD PTR [esp] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; aesenc_pclmul_l + vpxor xmm5, xmm5, xmm2 + vpxor xmm6, xmm6, xmm4 + vpxor xmm5, xmm5, xmm3 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vmovdqu xmm0, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpxor xmm6, xmm6, xmm1 + vpxor xmm7, xmm7, xmm5 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpxor xmm6, xmm6, xmm7 + ; aesenc_64_ghash - end + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_encrypt_avx2_ghash_64 +L_AES_GCM_encrypt_avx2_end_64: + vmovdqu OWORD PTR [esp+96], xmm6 + vmovdqu xmm3, OWORD PTR [edx+48] + vmovdqu xmm7, OWORD PTR [esp] + vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpclmulqdq xmm5, xmm7, xmm3, 16 + vpclmulqdq xmm1, xmm7, xmm3, 1 + vpclmulqdq xmm4, xmm7, xmm3, 0 + vpclmulqdq xmm6, xmm7, xmm3, 17 + vpxor xmm5, xmm5, xmm1 + vmovdqu xmm3, OWORD PTR [edx+32] + vmovdqu xmm7, OWORD PTR [esp+16] + vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpclmulqdq xmm2, xmm7, xmm3, 16 + vpclmulqdq xmm1, xmm7, xmm3, 1 + vpclmulqdq xmm0, xmm7, xmm3, 0 + vpclmulqdq xmm3, xmm7, xmm3, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vmovdqu xmm3, OWORD PTR [edx+16] + vmovdqu xmm7, OWORD PTR [esp+32] + vpshufb 
xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpclmulqdq xmm2, xmm7, xmm3, 16 + vpclmulqdq xmm1, xmm7, xmm3, 1 + vpclmulqdq xmm0, xmm7, xmm3, 0 + vpclmulqdq xmm3, xmm7, xmm3, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vmovdqu xmm0, OWORD PTR [esp+96] + vmovdqu xmm3, OWORD PTR [edx] + vmovdqu xmm7, OWORD PTR [esp+48] + vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm3, xmm3, xmm0 + vpclmulqdq xmm2, xmm7, xmm3, 16 + vpclmulqdq xmm1, xmm7, xmm3, 1 + vpclmulqdq xmm0, xmm7, xmm3, 0 + vpclmulqdq xmm3, xmm7, xmm3, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vpslldq xmm7, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm4, xmm4, xmm7 + vpxor xmm6, xmm6, xmm5 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpclmulqdq xmm0, xmm4, xmm2, 16 + vpshufd xmm1, xmm4, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 + vmovdqu xmm5, OWORD PTR [esp] + vmovdqu xmm4, OWORD PTR [esp+64] +L_AES_GCM_encrypt_avx2_done_64: + cmp ebx, DWORD PTR [esp+152] + je L_AES_GCM_encrypt_avx2_done_enc + mov eax, DWORD PTR [esp+152] + and eax, 4294967280 + cmp ebx, eax + jge L_AES_GCM_encrypt_avx2_last_block_done + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + ; aesenc_block + vmovdqu xmm1, xmm4 + vpshufb xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_one + vpxor xmm0, xmm0, [ebp] + vaesenc xmm0, xmm0, [ebp+16] + vaesenc xmm0, xmm0, [ebp+32] + vaesenc xmm0, xmm0, [ebp+48] + vaesenc xmm0, xmm0, [ebp+64] + vaesenc xmm0, xmm0, [ebp+80] + vaesenc xmm0, xmm0, [ebp+96] + vaesenc xmm0, xmm0, [ebp+112] + vaesenc xmm0, xmm0, [ebp+128] + vaesenc xmm0, xmm0, [ebp+144] + cmp DWORD PTR [esp+172], 11 + vmovdqu xmm2, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last + vaesenc xmm0, xmm0, 
xmm2 + vaesenc xmm0, xmm0, [ebp+176] + cmp DWORD PTR [esp+172], 13 + vmovdqu xmm2, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last + vaesenc xmm0, xmm0, xmm2 + vaesenc xmm0, xmm0, [ebp+208] + vmovdqu xmm2, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last: + vaesenclast xmm0, xmm0, xmm2 + vmovdqu xmm4, xmm1 + vmovdqu xmm1, OWORD PTR [ecx] + vpxor xmm0, xmm0, xmm1 + vmovdqu OWORD PTR [edx], xmm0 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm6, xmm6, xmm0 + add ebx, 16 + cmp ebx, eax + jge L_AES_GCM_encrypt_avx2_last_block_ghash +L_AES_GCM_encrypt_avx2_last_block_start: + vpshufb xmm7, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one + vmovdqu OWORD PTR [esp+64], xmm4 + ; aesenc_gfmul_sb + vpclmulqdq xmm2, xmm6, xmm5, 1 + vpclmulqdq xmm3, xmm6, xmm5, 16 + vpclmulqdq xmm1, xmm6, xmm5, 0 + vpclmulqdq xmm4, xmm6, xmm5, 17 + vpxor xmm7, xmm7, [ebp] + vaesenc xmm7, xmm7, [ebp+16] + vpxor xmm3, xmm3, xmm2 + vpslldq xmm2, xmm3, 8 + vpsrldq xmm3, xmm3, 8 + vaesenc xmm7, xmm7, [ebp+32] + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vaesenc xmm7, xmm7, [ebp+48] + vaesenc xmm7, xmm7, [ebp+64] + vaesenc xmm7, xmm7, [ebp+80] + vpshufd xmm2, xmm2, 78 + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vaesenc xmm7, xmm7, [ebp+96] + vaesenc xmm7, xmm7, [ebp+112] + vaesenc xmm7, xmm7, [ebp+128] + vpshufd xmm2, xmm2, 78 + vaesenc xmm7, xmm7, [ebp+144] + vpxor xmm4, xmm4, xmm3 + vpxor xmm2, xmm2, xmm4 + vmovdqu xmm0, OWORD PTR [ebp+160] + cmp DWORD PTR [esp+172], 11 + jl L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last + vaesenc xmm7, xmm7, xmm0 + vaesenc xmm7, xmm7, [ebp+176] + vmovdqu xmm0, OWORD PTR [ebp+192] + cmp DWORD PTR [esp+172], 13 + jl L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last + vaesenc xmm7, xmm7, xmm0 + vaesenc xmm7, xmm7, [ebp+208] + vmovdqu xmm0, OWORD PTR [ebp+224] 
+L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last: + vaesenclast xmm7, xmm7, xmm0 + vmovdqu xmm3, OWORD PTR [esi+ebx] + vpxor xmm6, xmm2, xmm1 + vpxor xmm7, xmm7, xmm3 + vmovdqu OWORD PTR [edi+ebx], xmm7 + vpshufb xmm7, xmm7, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm6, xmm6, xmm7 + vmovdqu xmm4, OWORD PTR [esp+64] + add ebx, 16 + cmp ebx, eax + jl L_AES_GCM_encrypt_avx2_last_block_start +L_AES_GCM_encrypt_avx2_last_block_ghash: + ; ghash_gfmul_red + vpclmulqdq xmm2, xmm6, xmm5, 16 + vpclmulqdq xmm1, xmm6, xmm5, 1 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm6, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm6, xmm6, xmm2 + vpxor xmm6, xmm6, xmm1 + vpxor xmm6, xmm6, xmm0 +L_AES_GCM_encrypt_avx2_last_block_done: + mov ecx, DWORD PTR [esp+152] + mov edx, DWORD PTR [esp+152] + and ecx, 15 + jz L_AES_GCM_encrypt_avx2_done_enc + ; aesenc_last15_enc + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpxor xmm4, xmm4, [ebp] + vaesenc xmm4, xmm4, [ebp+16] + vaesenc xmm4, xmm4, [ebp+32] + vaesenc xmm4, xmm4, [ebp+48] + vaesenc xmm4, xmm4, [ebp+64] + vaesenc xmm4, xmm4, [ebp+80] + vaesenc xmm4, xmm4, [ebp+96] + vaesenc xmm4, xmm4, [ebp+112] + vaesenc xmm4, xmm4, [ebp+128] + vaesenc xmm4, xmm4, [ebp+144] + cmp DWORD PTR [esp+172], 11 + vmovdqu xmm0, OWORD PTR [ebp+160] + jl L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm0 + vaesenc xmm4, xmm4, [ebp+176] + cmp DWORD PTR [esp+172], 13 + vmovdqu xmm0, OWORD PTR [ebp+192] + jl L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm0 + vaesenc xmm4, xmm4, [ebp+208] + vmovdqu xmm0, OWORD PTR [ebp+224] +L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast 
xmm4, xmm4, xmm0 + xor ecx, ecx + vpxor xmm0, xmm0, xmm0 + vmovdqu OWORD PTR [esp], xmm4 + vmovdqu OWORD PTR [esp+16], xmm0 +L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop: + movzx eax, BYTE PTR [esi+ebx] + xor al, BYTE PTR [esp+ecx] + mov BYTE PTR [esp+ecx+16], al + mov BYTE PTR [edi+ebx], al + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop +L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_finish_enc: + vmovdqu xmm4, OWORD PTR [esp+16] + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm6, xmm6, xmm4 + ; ghash_gfmul_red + vpclmulqdq xmm2, xmm6, xmm5, 16 + vpclmulqdq xmm1, xmm6, xmm5, 1 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm6, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm6, xmm6, xmm2 + vpxor xmm6, xmm6, xmm1 + vpxor xmm6, xmm6, xmm0 +L_AES_GCM_encrypt_avx2_done_enc: + vmovdqu xmm7, OWORD PTR [esp+80] + ; calc_tag + mov ecx, DWORD PTR [esp+152] + shl ecx, 3 + vpinsrd xmm0, xmm0, ecx, 0 + mov ecx, DWORD PTR [esp+156] + shl ecx, 3 + vpinsrd xmm0, xmm0, ecx, 2 + mov ecx, DWORD PTR [esp+152] + shr ecx, 29 + vpinsrd xmm0, xmm0, ecx, 1 + mov ecx, DWORD PTR [esp+156] + shr ecx, 29 + vpinsrd xmm0, xmm0, ecx, 3 + vpxor xmm0, xmm0, xmm6 + ; ghash_gfmul_red + vpclmulqdq xmm4, xmm0, xmm5, 16 + vpclmulqdq xmm3, xmm0, xmm5, 1 + vpclmulqdq xmm2, xmm0, xmm5, 0 + vpxor xmm4, xmm4, xmm3 + vpslldq xmm3, xmm4, 8 + vpsrldq xmm4, xmm4, 8 + vpxor xmm3, xmm3, xmm2 + vpclmulqdq xmm0, xmm0, xmm5, 17 + vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vpshufd xmm3, xmm3, 78 + vpxor xmm3, xmm3, xmm2 + vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vpshufd xmm3, xmm3, 78 + vpxor xmm0, xmm0, xmm4 + vpxor xmm0, xmm0, xmm3 + 
vpxor xmm0, xmm0, xmm2 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm0, xmm0, xmm7 + mov edi, DWORD PTR [esp+148] + mov ebx, DWORD PTR [esp+164] + ; store_tag + cmp ebx, 16 + je L_AES_GCM_encrypt_avx2_store_tag_16 + xor ecx, ecx + vmovdqu OWORD PTR [esp], xmm0 +L_AES_GCM_encrypt_avx2_store_tag_loop: + movzx eax, BYTE PTR [esp+ecx] + mov BYTE PTR [edi+ecx], al + inc ecx + cmp ecx, ebx + jne L_AES_GCM_encrypt_avx2_store_tag_loop + jmp L_AES_GCM_encrypt_avx2_store_tag_done +L_AES_GCM_encrypt_avx2_store_tag_16: + vmovdqu OWORD PTR [edi], xmm0 +L_AES_GCM_encrypt_avx2_store_tag_done: + add esp, 112 + pop ebp + pop edi + pop esi + pop ebx + ret +AES_GCM_encrypt_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_decrypt_avx2 PROC + push ebx + push esi + push edi + push ebp + sub esp, 176 + mov esi, DWORD PTR [esp+208] + mov ebp, DWORD PTR [esp+232] + vpxor xmm4, xmm4, xmm4 + mov edx, DWORD PTR [esp+224] + cmp edx, 12 + je L_AES_GCM_decrypt_avx2_iv_12 + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqu xmm5, OWORD PTR [ebp] + vaesenc xmm5, xmm5, [ebp+16] + vaesenc xmm5, xmm5, [ebp+32] + vaesenc xmm5, xmm5, [ebp+48] + vaesenc xmm5, xmm5, [ebp+64] + vaesenc xmm5, xmm5, [ebp+80] + vaesenc xmm5, xmm5, [ebp+96] + vaesenc xmm5, xmm5, [ebp+112] + vaesenc xmm5, xmm5, [ebp+128] + vaesenc xmm5, xmm5, [ebp+144] + cmp DWORD PTR [esp+236], 11 + vmovdqu xmm0, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm5, xmm5, [ebp+176] + cmp DWORD PTR [esp+236], 13 + vmovdqu xmm0, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm5, xmm5, [ebp+208] + vmovdqu xmm0, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm0 + vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov ecx, 0 + je 
L_AES_GCM_decrypt_avx2_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_decrypt_avx2_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_avx2_calc_iv_16_loop: + vmovdqu xmm0, OWORD PTR [esi+ecx] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_avx2_calc_iv_16_loop + mov edx, DWORD PTR [esp+224] + cmp ecx, edx + je L_AES_GCM_decrypt_avx2_calc_iv_done +L_AES_GCM_decrypt_avx2_calc_iv_lt16: + vpxor xmm0, xmm0, xmm0 + xor ebx, ebx + vmovdqu OWORD PTR [esp], xmm0 +L_AES_GCM_decrypt_avx2_calc_iv_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_avx2_calc_iv_loop + vmovdqu xmm0, OWORD PTR [esp] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, 
xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 +L_AES_GCM_decrypt_avx2_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vpinsrd xmm0, xmm0, edx, 0 + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_mask + ; Encrypt counter + vmovdqu xmm6, OWORD PTR [ebp] + vpxor xmm6, xmm6, xmm4 + vaesenc xmm6, xmm6, [ebp+16] + vaesenc xmm6, xmm6, [ebp+32] + vaesenc xmm6, xmm6, [ebp+48] + vaesenc xmm6, xmm6, [ebp+64] + vaesenc xmm6, xmm6, [ebp+80] + vaesenc xmm6, xmm6, [ebp+96] + vaesenc xmm6, xmm6, [ebp+112] + vaesenc xmm6, xmm6, [ebp+128] + vaesenc xmm6, xmm6, [ebp+144] + cmp DWORD PTR [esp+236], 11 + vmovdqu xmm0, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last + vaesenc xmm6, xmm6, xmm0 + vaesenc xmm6, xmm6, [ebp+176] + cmp DWORD PTR [esp+236], 13 + vmovdqu xmm0, OWORD 
PTR [ebp+192] + jl L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last + vaesenc xmm6, xmm6, xmm0 + vaesenc xmm6, xmm6, [ebp+208] + vmovdqu xmm0, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last: + vaesenclast xmm6, xmm6, xmm0 + jmp L_AES_GCM_decrypt_avx2_iv_done +L_AES_GCM_decrypt_avx2_iv_12: + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + vmovdqu xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one + vmovdqu xmm5, OWORD PTR [ebp] + vpblendd xmm4, xmm4, [esi], 7 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqu xmm7, OWORD PTR [ebp+16] + vpxor xmm6, xmm4, xmm5 + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm6, xmm6, xmm7 + vmovdqu xmm0, OWORD PTR [ebp+32] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+48] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+64] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+80] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+96] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+112] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+128] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+144] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + cmp DWORD PTR [esp+236], 11 + vmovdqu xmm0, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_avx2_calc_iv_12_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+176] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + cmp DWORD PTR [esp+236], 13 + vmovdqu xmm0, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_avx2_calc_iv_12_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+208] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm6, xmm6, xmm0 + vmovdqu xmm0, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx2_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm0 + 
vaesenclast xmm6, xmm6, xmm0 + vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_bswap_mask +L_AES_GCM_decrypt_avx2_iv_done: + vmovdqu OWORD PTR [esp+80], xmm6 + vpxor xmm6, xmm6, xmm6 + mov esi, DWORD PTR [esp+204] + ; Additional authentication data + mov edx, DWORD PTR [esp+220] + cmp edx, 0 + je L_AES_GCM_decrypt_avx2_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_decrypt_avx2_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_avx2_calc_aad_16_loop: + vmovdqu xmm0, OWORD PTR [esi+ecx] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm6, 16 + vpclmulqdq xmm1, xmm5, xmm6, 1 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm6, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_avx2_calc_aad_16_loop + mov edx, DWORD PTR [esp+220] + cmp ecx, edx + je L_AES_GCM_decrypt_avx2_calc_aad_done +L_AES_GCM_decrypt_avx2_calc_aad_lt16: + vpxor xmm0, xmm0, xmm0 + xor ebx, ebx + vmovdqu OWORD PTR [esp], xmm0 +L_AES_GCM_decrypt_avx2_calc_aad_loop: + movzx eax, BYTE PTR [esi+ecx] + mov BYTE PTR [esp+ebx], al + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_avx2_calc_aad_loop + vmovdqu xmm0, OWORD PTR [esp] + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm6, 16 + 
vpclmulqdq xmm1, xmm5, xmm6, 1 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm6, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 +L_AES_GCM_decrypt_avx2_calc_aad_done: + mov esi, DWORD PTR [esp+196] + mov edi, DWORD PTR [esp+200] + ; Calculate counter and H + vpsrlq xmm1, xmm5, 63 + vpsllq xmm0, xmm5, 1 + vpslldq xmm1, xmm1, 8 + vpor xmm0, xmm0, xmm1 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one + vpxor xmm5, xmm5, xmm0 + xor ebx, ebx + cmp DWORD PTR [esp+216], 64 + mov eax, DWORD PTR [esp+216] + jl L_AES_GCM_decrypt_avx2_done_64 + and eax, 4294967232 + vmovdqu OWORD PTR [esp+64], xmm4 + vmovdqu OWORD PTR [esp+96], xmm6 + vmovdqu xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128 + ; H ^ 1 + vmovdqu OWORD PTR [esp], xmm5 + vmovdqu xmm2, xmm5 + ; H ^ 2 + vpclmulqdq xmm5, xmm2, xmm2, 0 + vpclmulqdq xmm6, xmm2, xmm2, 17 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm5, xmm5, xmm4 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm5, xmm5, xmm4 + vpxor xmm0, xmm6, xmm5 + vmovdqu OWORD PTR [esp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red + vpclmulqdq xmm6, xmm2, xmm0, 16 + vpclmulqdq xmm5, xmm2, xmm0, 1 + vpclmulqdq xmm4, xmm2, xmm0, 0 + vpxor xmm6, xmm6, xmm5 + vpslldq xmm5, xmm6, 8 + 
vpsrldq xmm6, xmm6, 8 + vpxor xmm5, xmm5, xmm4 + vpclmulqdq xmm1, xmm2, xmm0, 17 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm5, xmm5, xmm4 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm1, xmm1, xmm6 + vpxor xmm1, xmm1, xmm5 + vpxor xmm1, xmm1, xmm4 + vmovdqu OWORD PTR [esp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm5, xmm0, xmm0, 0 + vpclmulqdq xmm6, xmm0, xmm0, 17 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm5, xmm5, xmm4 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm6, xmm5 + vmovdqu OWORD PTR [esp+48], xmm2 + vmovdqu xmm6, OWORD PTR [esp+96] + cmp edi, esi + jne L_AES_GCM_decrypt_avx2_ghash_64 +L_AES_GCM_decrypt_avx2_ghash_64_inplace: + ; aesenc_64_ghash + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + ; aesenc_64 + ; aesenc_ctr + vmovdqu xmm4, OWORD PTR [esp+64] + vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one + vpshufb xmm0, xmm4, xmm7 + vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two + vpshufb xmm1, xmm1, xmm7 + vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three + vpshufb xmm2, xmm2, xmm7 + vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four + vpshufb xmm3, xmm3, xmm7 + ; aesenc_xor + vmovdqu xmm7, OWORD PTR [ebp] + vmovdqu OWORD PTR [esp+64], xmm4 + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+16] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+32] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+48] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+64] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, 
xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+80] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+96] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+112] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+128] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+144] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+236], 11 + vmovdqu xmm7, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+176] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+236], 13 + vmovdqu xmm7, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+208] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done: + ; aesenc_last + vaesenclast xmm0, xmm0, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vaesenclast xmm2, xmm2, xmm7 + vaesenclast xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ecx] + vmovdqu xmm4, OWORD PTR [ecx+16] + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, xmm1, xmm4 + vmovdqu OWORD PTR [esp+112], xmm7 + vmovdqu OWORD PTR [esp+128], xmm4 + vmovdqu OWORD PTR [edx], xmm0 
+ vmovdqu OWORD PTR [edx+16], xmm1 + vmovdqu xmm7, OWORD PTR [ecx+32] + vmovdqu xmm4, OWORD PTR [ecx+48] + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm4 + vmovdqu OWORD PTR [esp+144], xmm7 + vmovdqu OWORD PTR [esp+160], xmm4 + vmovdqu OWORD PTR [edx+32], xmm2 + vmovdqu OWORD PTR [edx+48], xmm3 + ; pclmul_1 + vmovdqu xmm1, OWORD PTR [esp+112] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vmovdqu xmm2, OWORD PTR [esp+48] + vpxor xmm1, xmm1, xmm6 + vpclmulqdq xmm5, xmm1, xmm2, 16 + vpclmulqdq xmm3, xmm1, xmm2, 1 + vpclmulqdq xmm6, xmm1, xmm2, 0 + vpclmulqdq xmm7, xmm1, xmm2, 17 + ; pclmul_2 + vmovdqu xmm1, OWORD PTR [esp+128] + vmovdqu xmm0, OWORD PTR [esp+32] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; pclmul_n + vmovdqu xmm1, OWORD PTR [esp+144] + vmovdqu xmm0, OWORD PTR [esp+16] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; pclmul_n + vmovdqu xmm1, OWORD PTR [esp+160] + vmovdqu xmm0, OWORD PTR [esp] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; aesenc_pclmul_l + vpxor xmm5, xmm5, xmm2 + vpxor xmm6, xmm6, xmm4 + vpxor xmm5, xmm5, xmm3 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vmovdqu xmm0, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpxor xmm6, xmm6, xmm1 + vpxor xmm7, xmm7, xmm5 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 
+ vpclmulqdq xmm3, xmm6, xmm0, 16 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpxor xmm6, xmm6, xmm7 + ; aesenc_64_ghash - end + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_decrypt_avx2_ghash_64_inplace + jmp L_AES_GCM_decrypt_avx2_ghash_64_done +L_AES_GCM_decrypt_avx2_ghash_64: + ; aesenc_64_ghash + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + ; aesenc_64 + ; aesenc_ctr + vmovdqu xmm4, OWORD PTR [esp+64] + vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one + vpshufb xmm0, xmm4, xmm7 + vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two + vpshufb xmm1, xmm1, xmm7 + vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three + vpshufb xmm2, xmm2, xmm7 + vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four + vpshufb xmm3, xmm3, xmm7 + ; aesenc_xor + vmovdqu xmm7, OWORD PTR [ebp] + vmovdqu OWORD PTR [esp+64], xmm4 + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+16] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+32] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+48] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+64] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+80] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+96] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+112] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, 
OWORD PTR [ebp+128] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+144] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+236], 11 + vmovdqu xmm7, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+176] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+236], 13 + vmovdqu xmm7, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+208] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done: + ; aesenc_last + vaesenclast xmm0, xmm0, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vaesenclast xmm2, xmm2, xmm7 + vaesenclast xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ecx] + vmovdqu xmm4, OWORD PTR [ecx+16] + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, xmm1, xmm4 + vmovdqu OWORD PTR [edx], xmm0 + vmovdqu OWORD PTR [edx+16], xmm1 + vmovdqu xmm7, OWORD PTR [ecx+32] + vmovdqu xmm4, OWORD PTR [ecx+48] + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm4 + vmovdqu OWORD PTR [ecx+32], xmm7 + vmovdqu OWORD PTR [ecx+48], xmm4 + vmovdqu OWORD PTR [edx+32], xmm2 + vmovdqu OWORD PTR [edx+48], xmm3 + ; pclmul_1 + vmovdqu xmm1, OWORD PTR [ecx] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vmovdqu xmm2, OWORD PTR [esp+48] + vpxor xmm1, xmm1, xmm6 + vpclmulqdq xmm5, xmm1, xmm2, 16 + vpclmulqdq xmm3, xmm1, xmm2, 1 + vpclmulqdq xmm6, xmm1, xmm2, 0 + vpclmulqdq xmm7, 
xmm1, xmm2, 17 + ; pclmul_2 + vmovdqu xmm1, OWORD PTR [ecx+16] + vmovdqu xmm0, OWORD PTR [esp+32] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; pclmul_n + vmovdqu xmm1, OWORD PTR [ecx+32] + vmovdqu xmm0, OWORD PTR [esp+16] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; pclmul_n + vmovdqu xmm1, OWORD PTR [ecx+48] + vmovdqu xmm0, OWORD PTR [esp] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; aesenc_pclmul_l + vpxor xmm5, xmm5, xmm2 + vpxor xmm6, xmm6, xmm4 + vpxor xmm5, xmm5, xmm3 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vmovdqu xmm0, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpxor xmm6, xmm6, xmm1 + vpxor xmm7, xmm7, xmm5 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpxor xmm6, xmm6, xmm7 + ; aesenc_64_ghash - end + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_decrypt_avx2_ghash_64 +L_AES_GCM_decrypt_avx2_ghash_64_done: + vmovdqu xmm5, OWORD PTR [esp] + vmovdqu xmm4, OWORD PTR [esp+64] +L_AES_GCM_decrypt_avx2_done_64: + cmp ebx, DWORD PTR [esp+216] + jge L_AES_GCM_decrypt_avx2_done_dec + mov eax, DWORD PTR [esp+216] + and eax, 4294967280 + cmp ebx, eax + jge L_AES_GCM_decrypt_avx2_last_block_done +L_AES_GCM_decrypt_avx2_last_block_start: + vmovdqu xmm0, OWORD 
PTR [esi+ebx] + vpshufb xmm7, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one + vmovdqu OWORD PTR [esp+64], xmm4 + vpxor xmm4, xmm0, xmm6 + ; aesenc_gfmul_sb + vpclmulqdq xmm2, xmm4, xmm5, 1 + vpclmulqdq xmm3, xmm4, xmm5, 16 + vpclmulqdq xmm1, xmm4, xmm5, 0 + vpclmulqdq xmm4, xmm4, xmm5, 17 + vpxor xmm7, xmm7, [ebp] + vaesenc xmm7, xmm7, [ebp+16] + vpxor xmm3, xmm3, xmm2 + vpslldq xmm2, xmm3, 8 + vpsrldq xmm3, xmm3, 8 + vaesenc xmm7, xmm7, [ebp+32] + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vaesenc xmm7, xmm7, [ebp+48] + vaesenc xmm7, xmm7, [ebp+64] + vaesenc xmm7, xmm7, [ebp+80] + vpshufd xmm2, xmm2, 78 + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vaesenc xmm7, xmm7, [ebp+96] + vaesenc xmm7, xmm7, [ebp+112] + vaesenc xmm7, xmm7, [ebp+128] + vpshufd xmm2, xmm2, 78 + vaesenc xmm7, xmm7, [ebp+144] + vpxor xmm4, xmm4, xmm3 + vpxor xmm2, xmm2, xmm4 + vmovdqu xmm0, OWORD PTR [ebp+160] + cmp DWORD PTR [esp+236], 11 + jl L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last + vaesenc xmm7, xmm7, xmm0 + vaesenc xmm7, xmm7, [ebp+176] + vmovdqu xmm0, OWORD PTR [ebp+192] + cmp DWORD PTR [esp+236], 13 + jl L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last + vaesenc xmm7, xmm7, xmm0 + vaesenc xmm7, xmm7, [ebp+208] + vmovdqu xmm0, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last: + vaesenclast xmm7, xmm7, xmm0 + vmovdqu xmm3, OWORD PTR [esi+ebx] + vpxor xmm6, xmm2, xmm1 + vpxor xmm7, xmm7, xmm3 + vmovdqu OWORD PTR [edi+ebx], xmm7 + vmovdqu xmm4, OWORD PTR [esp+64] + add ebx, 16 + cmp ebx, eax + jl L_AES_GCM_decrypt_avx2_last_block_start +L_AES_GCM_decrypt_avx2_last_block_done: + mov ecx, DWORD PTR [esp+216] + mov edx, DWORD PTR [esp+216] + and ecx, 15 + jz L_AES_GCM_decrypt_avx2_done_dec + ; aesenc_last15_dec + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpxor xmm4, xmm4, 
[ebp] + vaesenc xmm4, xmm4, [ebp+16] + vaesenc xmm4, xmm4, [ebp+32] + vaesenc xmm4, xmm4, [ebp+48] + vaesenc xmm4, xmm4, [ebp+64] + vaesenc xmm4, xmm4, [ebp+80] + vaesenc xmm4, xmm4, [ebp+96] + vaesenc xmm4, xmm4, [ebp+112] + vaesenc xmm4, xmm4, [ebp+128] + vaesenc xmm4, xmm4, [ebp+144] + cmp DWORD PTR [esp+236], 11 + vmovdqu xmm1, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm1 + vaesenc xmm4, xmm4, [ebp+176] + cmp DWORD PTR [esp+236], 13 + vmovdqu xmm1, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm1 + vaesenc xmm4, xmm4, [ebp+208] + vmovdqu xmm1, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast xmm4, xmm4, xmm1 + xor ecx, ecx + vpxor xmm0, xmm0, xmm0 + vmovdqu OWORD PTR [esp], xmm4 + vmovdqu OWORD PTR [esp+16], xmm0 +L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop: + movzx eax, BYTE PTR [esi+ebx] + mov BYTE PTR [esp+ecx+16], al + xor al, BYTE PTR [esp+ecx] + mov BYTE PTR [edi+ebx], al + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop + vmovdqu xmm4, OWORD PTR [esp+16] + vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm6, xmm6, xmm4 + ; ghash_gfmul_red + vpclmulqdq xmm2, xmm6, xmm5, 16 + vpclmulqdq xmm1, xmm6, xmm5, 1 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm6, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm6, xmm6, xmm2 + vpxor xmm6, xmm6, xmm1 + vpxor xmm6, xmm6, xmm0 +L_AES_GCM_decrypt_avx2_done_dec: + vmovdqu xmm7, OWORD PTR [esp+80] + ; calc_tag + mov ecx, DWORD PTR [esp+216] + shl ecx, 3 + vpinsrd xmm0, xmm0, ecx, 0 + mov ecx, DWORD PTR 
[esp+220]
+        shl ecx, 3
+        vpinsrd xmm0, xmm0, ecx, 2
+        mov ecx, DWORD PTR [esp+216]
+        shr ecx, 29
+        vpinsrd xmm0, xmm0, ecx, 1
+        mov ecx, DWORD PTR [esp+220]
+        shr ecx, 29
+        vpinsrd xmm0, xmm0, ecx, 3
+        vpxor xmm0, xmm0, xmm6
+        ; ghash_gfmul_red
+        vpclmulqdq xmm4, xmm0, xmm5, 16
+        vpclmulqdq xmm3, xmm0, xmm5, 1
+        vpclmulqdq xmm2, xmm0, xmm5, 0
+        vpxor xmm4, xmm4, xmm3
+        vpslldq xmm3, xmm4, 8
+        vpsrldq xmm4, xmm4, 8
+        vpxor xmm3, xmm3, xmm2
+        vpclmulqdq xmm0, xmm0, xmm5, 17
+        vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+        vpshufd xmm3, xmm3, 78
+        vpxor xmm3, xmm3, xmm2
+        vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+        vpshufd xmm3, xmm3, 78
+        vpxor xmm0, xmm0, xmm4
+        vpxor xmm0, xmm0, xmm3
+        vpxor xmm0, xmm0, xmm2
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpxor xmm0, xmm0, xmm7
+        mov edi, DWORD PTR [esp+212]
+        mov ebx, DWORD PTR [esp+228]
+        mov ebp, DWORD PTR [esp+240]
+        ; cmp_tag
+        cmp ebx, 16
+        je L_AES_GCM_decrypt_avx2_cmp_tag_16
+        xor edx, edx
+        xor ecx, ecx
+        vmovdqu OWORD PTR [esp], xmm0
+L_AES_GCM_decrypt_avx2_cmp_tag_loop:
+        movzx eax, BYTE PTR [esp+edx]
+        xor al, BYTE PTR [edi+edx]
+        or cl, al
+        inc edx
+        cmp edx, ebx
+        jne L_AES_GCM_decrypt_avx2_cmp_tag_loop
+        cmp cl, 0
+        sete cl
+        jmp L_AES_GCM_decrypt_avx2_cmp_tag_done
+L_AES_GCM_decrypt_avx2_cmp_tag_16:
+        vmovdqu xmm1, OWORD PTR [edi]
+        vpcmpeqb xmm0, xmm0, xmm1
+        vpmovmskb edx, xmm0
+        ; %%edx == 0xFFFF then return 1 else => return 0
+        xor ecx, ecx
+        cmp edx, 65535
+        sete cl
+L_AES_GCM_decrypt_avx2_cmp_tag_done:
+        mov DWORD PTR [ebp], ecx
+        add esp, 176
+        pop ebp
+        pop edi
+        pop esi
+        pop ebx
+        ret
+AES_GCM_decrypt_avx2 ENDP
+_TEXT ENDS
+IFDEF WOLFSSL_AESGCM_STREAM ; the code that follows is assembled only when this symbol is defined
+_TEXT SEGMENT READONLY PARA
+AES_GCM_init_avx2 PROC ; GCM init: from the AES round keys and the IV, compute H (xmm5), the counter block (xmm4) and the encrypted counter T (xmm7); all arguments are read from the stack
+        push ebx ; ebx, esi, edi, ebp are saved here and restored before ret
+        push esi
+        push edi
+        push ebp
+        sub esp, 32 ; local scratch; [esp..esp+15] buffers a partial IV block
+        mov ebp, DWORD PTR [esp+52] ; stack arg 1: AES round keys, read at [ebp], [ebp+16], ...
+        mov esi, DWORD PTR [esp+60] ; stack arg 3: IV bytes
+        mov edi, DWORD PTR [esp+76] ; stack arg 7: destination for the encrypted counter block (xmm7)
+        vpxor xmm4, xmm4, xmm4 ; xmm4 = 0: running GHASH value over the IV
+        mov edx, DWORD PTR [esp+64] ; stack arg 4: IV length in bytes
+        cmp edx, 12
+        je L_AES_GCM_init_avx2_iv_12 ; a 12-byte IV takes the path that needs no GHASH
+        ; Calculate values when IV is not 12 bytes
+        ; H = Encrypt X(=0)
+        vmovdqu xmm5, OWORD PTR [ebp] ; zero block XOR round key 0 is just round key 0
+        vaesenc xmm5, xmm5, [ebp+16]
+        vaesenc xmm5, xmm5, [ebp+32]
+        vaesenc xmm5, xmm5, [ebp+48]
+        vaesenc xmm5, xmm5, [ebp+64]
+        vaesenc xmm5, xmm5, [ebp+80]
+        vaesenc xmm5, xmm5, [ebp+96]
+        vaesenc xmm5, xmm5, [ebp+112]
+        vaesenc xmm5, xmm5, [ebp+128]
+        vaesenc xmm5, xmm5, [ebp+144]
+        cmp DWORD PTR [esp+56], 11 ; stack arg 2: round count; below 11 the key at +160 is the final round key
+        vmovdqu xmm0, OWORD PTR [ebp+160]
+        jl L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm5, xmm5, [ebp+176]
+        cmp DWORD PTR [esp+56], 13 ; below 13 the key at +192 is the final round key, otherwise the one at +224
+        vmovdqu xmm0, OWORD PTR [ebp+192]
+        jl L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm5, xmm5, [ebp+208]
+        vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last:
+        vaesenclast xmm5, xmm5, xmm0
+        vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        ; Calc counter
+        ; Initialization vector
+        cmp edx, 0
+        mov ecx, 0 ; ecx = byte offset into the IV (mov leaves the flags from cmp intact)
+        je L_AES_GCM_init_avx2_calc_iv_done
+        cmp edx, 16
+        jl L_AES_GCM_init_avx2_calc_iv_lt16
+        and edx, 4294967280 ; 0xFFFFFFF0: IV length rounded down to whole 16-byte blocks
+L_AES_GCM_init_avx2_calc_iv_16_loop:
+        vmovdqu xmm0, OWORD PTR [esi+ecx]
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpxor xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq xmm2, xmm5, xmm4, 16
+        vpclmulqdq xmm1, xmm5, xmm4, 1
+        vpclmulqdq xmm0, xmm5, xmm4, 0
+        vpclmulqdq xmm3, xmm5, xmm4, 17
+        vpxor xmm2, xmm2, xmm1
+        vpslldq xmm1, xmm2, 8
+        vpsrldq xmm2, xmm2, 8
+        vpxor xmm6, xmm0, xmm1
+        vpxor xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld xmm0, xmm6, 31
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+        vpclmulqdq xmm0, xmm6, xmm2, 16
+        vpshufd xmm1, xmm6, 78
+        vpxor xmm1, xmm1, xmm0
+        vpclmulqdq xmm0, xmm1, xmm2, 16
+        vpshufd xmm1, xmm1, 78
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm4, xmm4, xmm1
+        add ecx, 16
+        cmp ecx, edx
+        jl L_AES_GCM_init_avx2_calc_iv_16_loop
+        mov edx, DWORD PTR [esp+64] ; reload the full IV length
+        cmp ecx, edx
+        je L_AES_GCM_init_avx2_calc_iv_done ; no partial block left
+L_AES_GCM_init_avx2_calc_iv_lt16:
+        vpxor xmm0, xmm0, xmm0
+        xor ebx, ebx ; ebx = write index into the stack buffer
+        vmovdqu OWORD PTR [esp], xmm0 ; zero the 16-byte buffer so the partial block is zero padded
+L_AES_GCM_init_avx2_calc_iv_loop:
+        movzx eax, BYTE PTR [esi+ecx]
+        mov BYTE PTR [esp+ebx], al
+        inc ecx
+        inc ebx
+        cmp ecx, edx
+        jl L_AES_GCM_init_avx2_calc_iv_loop
+        vmovdqu xmm0, OWORD PTR [esp]
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpxor xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq xmm2, xmm5, xmm4, 16
+        vpclmulqdq xmm1, xmm5, xmm4, 1
+        vpclmulqdq xmm0, xmm5, xmm4, 0
+        vpclmulqdq xmm3, xmm5, xmm4, 17
+        vpxor xmm2, xmm2, xmm1
+        vpslldq xmm1, xmm2, 8
+        vpsrldq xmm2, xmm2, 8
+        vpxor xmm6, xmm0, xmm1
+        vpxor xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld xmm0, xmm6, 31
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+        vpclmulqdq xmm0, xmm6, xmm2, 16
+        vpshufd xmm1, xmm6, 78
+        vpxor xmm1, xmm1, xmm0
+        vpclmulqdq xmm0, xmm1, xmm2, 16
+        vpshufd xmm1, xmm1, 78
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm4, xmm4, xmm1
+L_AES_GCM_init_avx2_calc_iv_done:
+        ; T = Encrypt counter
+        vpxor xmm0, xmm0, xmm0
+        shl edx, 3 ; IV length in bits
+        vpinsrd xmm0, xmm0, edx, 0 ; block holding only the bit length, folded into the hash below
+        vpxor xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq xmm2, xmm5, xmm4, 16
+        vpclmulqdq xmm1, xmm5, xmm4, 1
+        vpclmulqdq xmm0, xmm5, xmm4, 0
+        vpclmulqdq xmm3, xmm5, xmm4, 17
+        vpxor xmm2, xmm2, xmm1
+        vpslldq xmm1, xmm2, 8
+        vpsrldq xmm2, xmm2, 8
+        vpxor xmm6, xmm0, xmm1
+        vpxor xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld xmm0, xmm6, 31
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+        vpclmulqdq xmm0, xmm6, xmm2, 16
+        vpshufd xmm1, xmm6, 78
+        vpxor xmm1, xmm1, xmm0
+        vpclmulqdq xmm0, xmm1, xmm2, 16
+        vpshufd xmm1, xmm1, 78
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm4, xmm4, xmm1
+        vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_mask ; xmm4 now holds the counter block derived from the IV hash
+        ; Encrypt counter
+        vmovdqu xmm7, OWORD PTR [ebp]
+        vpxor xmm7, xmm7, xmm4
+        vaesenc xmm7, xmm7, [ebp+16]
+        vaesenc xmm7, xmm7, [ebp+32]
+        vaesenc xmm7, xmm7, [ebp+48]
+        vaesenc xmm7, xmm7, [ebp+64]
+        vaesenc xmm7, xmm7, [ebp+80]
+        vaesenc xmm7, xmm7, [ebp+96]
+        vaesenc xmm7, xmm7, [ebp+112]
+        vaesenc xmm7, xmm7, [ebp+128]
+        vaesenc xmm7, xmm7, [ebp+144]
+        cmp DWORD PTR [esp+56], 11 ; same round-count dispatch as for H above
+        vmovdqu xmm0, OWORD PTR [ebp+160]
+        jl L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc xmm7, xmm7, xmm0
+        vaesenc xmm7, xmm7, [ebp+176]
+        cmp DWORD PTR [esp+56], 13
+        vmovdqu xmm0, OWORD PTR [ebp+192]
+        jl L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc xmm7, xmm7, xmm0
+        vaesenc xmm7, xmm7, [ebp+208]
+        vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last:
+        vaesenclast xmm7, xmm7, xmm0
+        jmp L_AES_GCM_init_avx2_iv_done
+L_AES_GCM_init_avx2_iv_12:
+        ; Calculate values when IV is 12 bytes
+        ; Set counter based on IV
+        vmovdqu xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one
+        vmovdqu xmm5, OWORD PTR [ebp]
+        vpblendd xmm4, xmm4, [esi], 7 ; mask 7: low three dwords (12 bytes) come from the IV, top dword stays from the constant. NOTE(review): the memory operand is a 16-byte read; confirm the IV buffer is readable that far
+        ; H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqu xmm6, OWORD PTR [ebp+16]
+        vpxor xmm7, xmm4, xmm5 ; xmm7 = counter XOR round key 0; xmm5 (round key 0) is the zero block after the same step
+        vaesenc xmm5, xmm5, xmm6
+        vaesenc xmm7, xmm7, xmm6
+        vmovdqu xmm0, OWORD PTR [ebp+32]
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm7, xmm7, xmm0
+        vmovdqu xmm0, OWORD PTR [ebp+48]
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm7, xmm7, xmm0
+        vmovdqu xmm0, OWORD PTR [ebp+64]
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm7, xmm7, xmm0
+        vmovdqu xmm0, OWORD PTR [ebp+80]
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm7, xmm7, xmm0
+        vmovdqu xmm0, OWORD PTR [ebp+96]
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm7, xmm7, xmm0
+        vmovdqu xmm0, OWORD PTR [ebp+112]
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm7, xmm7, xmm0
+        vmovdqu xmm0, OWORD PTR [ebp+128]
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm7, xmm7, xmm0
+        vmovdqu xmm0, OWORD PTR [ebp+144]
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm7, xmm7, xmm0
+        cmp DWORD PTR [esp+56], 11 ; stack arg 2: round count selects which key is the final round key
+        vmovdqu xmm0, OWORD PTR [ebp+160]
+        jl L_AES_GCM_init_avx2_calc_iv_12_last
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm7, xmm7, xmm0
+        vmovdqu xmm0, OWORD PTR [ebp+176]
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm7, xmm7, xmm0
+        cmp DWORD PTR [esp+56], 13
+        vmovdqu xmm0, OWORD PTR [ebp+192]
+        jl L_AES_GCM_init_avx2_calc_iv_12_last
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm7, xmm7, xmm0
+        vmovdqu xmm0, OWORD PTR [ebp+208]
+        vaesenc xmm5, xmm5, xmm0
+        vaesenc xmm7, xmm7, xmm0
+        vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_init_avx2_calc_iv_12_last:
+        vaesenclast xmm5, xmm5, xmm0
+        vaesenclast xmm7, xmm7, xmm0
+        vpshufb xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_bswap_mask
+L_AES_GCM_init_avx2_iv_done:
+        vmovdqu OWORD PTR [edi], xmm7 ; encrypted counter block out through stack arg 7
+        mov ebp, DWORD PTR [esp+68] ; stack arg 5: receives H
+        mov edi, DWORD PTR [esp+72] ; stack arg 6: receives the counter for the first data block
+        vpshufb xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+        vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one ; presumably counter + 1; the constant is defined elsewhere in the file
+        vmovdqu OWORD PTR [ebp], xmm5
+        vmovdqu OWORD PTR [edi], xmm4
+        add esp, 32
+        pop ebp
+        pop edi
+        pop esi
+        pop ebx
+        ret
+AES_GCM_init_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_aad_update_avx2 PROC ; folds 16-byte blocks of input into a GHASH value held in memory; arguments are read from the stack
+        push esi
+        push edi
+        mov esi, DWORD PTR [esp+12] ; stack arg 1: input bytes
+        mov edx, DWORD PTR [esp+16] ; stack arg 2: input length in bytes (loop bound)
+        mov edi, DWORD PTR [esp+20] ; stack arg 3: 16-byte GHASH value, read here and written back at the end
+        mov eax, DWORD PTR [esp+24] ; stack arg 4: 16-byte multiplier (H)
+        vmovdqu xmm4, OWORD PTR [edi]
+        vmovdqu xmm5, OWORD PTR [eax]
+        xor ecx, ecx ; ecx = byte offset into the input
+L_AES_GCM_aad_update_avx2_16_loop:
+        vmovdqu xmm0, OWORD PTR [esi+ecx]
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpxor xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq xmm2, xmm5, xmm4, 16
+        vpclmulqdq xmm1, xmm5, xmm4, 1
+        vpclmulqdq xmm0, xmm5, xmm4, 0
+        vpclmulqdq xmm3,
xmm5, xmm4, 17
+        vpxor xmm2, xmm2, xmm1
+        vpslldq xmm1, xmm2, 8
+        vpsrldq xmm2, xmm2, 8
+        vpxor xmm6, xmm0, xmm1
+        vpxor xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld xmm0, xmm6, 31
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+        vpclmulqdq xmm0, xmm6, xmm2, 16
+        vpshufd xmm1, xmm6, 78
+        vpxor xmm1, xmm1, xmm0
+        vpclmulqdq xmm0, xmm1, xmm2, 16
+        vpshufd xmm1, xmm1, 78
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm4, xmm4, xmm1
+        add ecx, 16 ; advance to the next 16-byte block
+        cmp ecx, edx
+        jl L_AES_GCM_aad_update_avx2_16_loop ; signed compare against the length; the test follows the body, so the first block is always processed
+        vmovdqu OWORD PTR [edi], xmm4 ; write the updated GHASH value back
+        pop edi
+        pop esi
+        ret
+AES_GCM_aad_update_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_block_avx2 PROC ; encrypts the counter block, XORs it with one 16-byte input block, and stores the output and the advanced counter; arguments are read from the stack
+        push esi
+        push edi
+        mov ecx, DWORD PTR [esp+12] ; stack arg 1: AES round keys
+        mov eax, DWORD PTR [esp+16] ; stack arg 2: round count
+        mov edi, DWORD PTR [esp+20] ; stack arg 3: 16-byte output
+        mov esi, DWORD PTR [esp+24] ; stack arg 4: 16-byte input
+        mov edx, DWORD PTR [esp+28] ; stack arg 5: 16-byte counter, read here and written back at the end
+        vmovdqu xmm3, OWORD PTR [edx]
+        ; aesenc_block
+        vmovdqu xmm1, xmm3
+        vpshufb xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_epi64 ; xmm0 = byte-shuffled copy of the current counter, the block that gets encrypted
+        vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_one ; xmm1 = counter for the next call
+        vpxor xmm0, xmm0, [ecx]
+        vaesenc xmm0, xmm0, [ecx+16]
+        vaesenc xmm0, xmm0, [ecx+32]
+        vaesenc xmm0, xmm0, [ecx+48]
+        vaesenc xmm0, xmm0, [ecx+64]
+        vaesenc xmm0, xmm0, [ecx+80]
+        vaesenc xmm0, xmm0, [ecx+96]
+        vaesenc xmm0, xmm0, [ecx+112]
+        vaesenc xmm0, xmm0, [ecx+128]
+        vaesenc xmm0, xmm0, [ecx+144]
+        cmp eax, 11 ; round count below 11: the key at +160 is the final round key
+        vmovdqu xmm2, OWORD PTR [ecx+160]
+        jl L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last
+        vaesenc xmm0, xmm0, xmm2
+        vaesenc xmm0, xmm0, [ecx+176]
+        cmp eax, 13 ; below 13: the key at +192 is final, otherwise the one at +224
+        vmovdqu xmm2, OWORD PTR [ecx+192]
+        jl L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last
+        vaesenc xmm0, xmm0, xmm2
+        vaesenc xmm0, xmm0, [ecx+208]
+        vmovdqu xmm2, OWORD PTR [ecx+224]
+L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last:
+        vaesenclast xmm0, xmm0, xmm2
+        vmovdqu xmm3, xmm1
+        vmovdqu xmm1, OWORD PTR [esi]
+        vpxor xmm0, xmm0, xmm1 ; output = input XOR encrypted counter
+        vmovdqu OWORD PTR [edi], xmm0
+        vmovdqu OWORD PTR [edx], xmm3 ; store the advanced counter
+        pop edi
+        pop esi
+        ret
+AES_GCM_encrypt_block_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_ghash_block_avx2 PROC ; absorbs one 16-byte block into a GHASH value held in memory; uses only eax, ecx, edx and xmm0-xmm6, so nothing is pushed
+        mov edx, DWORD PTR [esp+4] ; stack arg 1: 16-byte data block
+        mov eax, DWORD PTR [esp+8] ; stack arg 2: 16-byte GHASH value, read here and written back at the end
+        mov ecx, DWORD PTR [esp+12] ; stack arg 3: 16-byte multiplier (H)
+        vmovdqu xmm4, OWORD PTR [eax]
+        vmovdqu xmm5, OWORD PTR [ecx]
+        vmovdqu xmm0, OWORD PTR [edx]
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpxor xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq xmm2, xmm5, xmm4, 16
+        vpclmulqdq xmm1, xmm5, xmm4, 1
+        vpclmulqdq xmm0, xmm5, xmm4, 0
+        vpclmulqdq xmm3, xmm5, xmm4, 17
+        vpxor xmm2, xmm2, xmm1
+        vpslldq xmm1, xmm2, 8
+        vpsrldq xmm2, xmm2, 8
+        vpxor xmm6, xmm0, xmm1
+        vpxor xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld xmm0, xmm6, 31
+        vpsrld xmm1, xmm4, 31
+        vpslld xmm6, xmm6, 1
+        vpslld xmm4, xmm4, 1
+        vpsrldq xmm2, xmm0, 12
+        vpslldq xmm0, xmm0, 4
+        vpslldq xmm1, xmm1, 4
+        vpor xmm4, xmm4, xmm2
+        vpor xmm6, xmm6, xmm0
+        vpor xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+        vpclmulqdq xmm0, xmm6, xmm2, 16
+        vpshufd xmm1, xmm6, 78
+        vpxor xmm1, xmm1, xmm0
+        vpclmulqdq xmm0, xmm1, xmm2, 16
+        vpshufd xmm1, xmm1, 78
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm4, xmm4, xmm1
+        vmovdqu OWORD PTR [eax], xmm4 ; write the updated GHASH value back
+        ret
+AES_GCM_ghash_block_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_update_avx2 PROC ; encrypts whole 16-byte blocks in counter mode and folds the output into the GHASH value; arguments are read from the stack
+        push ebx
+        push esi
+        push edi
+        push ebp
+        sub esp, 96 ; locals: [esp..esp+63] hold four 16-byte multipliers, [esp+64] the counter, [esp+80] the GHASH value
+        mov esi, DWORD PTR [esp+144] ; stack arg 8: counter block (written back on exit)
+        vmovdqu xmm4, OWORD PTR [esi]
+        vmovdqu OWORD PTR [esp+64], xmm4
+        mov esi, DWORD PTR [esp+136] ; stack arg 6: GHASH value (written back on exit)
+        mov ebp, DWORD PTR [esp+140] ; stack arg 7: hash key H
+        vmovdqu xmm6, OWORD PTR [esi]
+        vmovdqu xmm5, OWORD PTR [ebp]
+        vmovdqu OWORD PTR [esp+80], xmm6
+        mov ebp, DWORD PTR [esp+116] ; stack arg 1: AES round keys
+        mov edi, DWORD PTR [esp+124] ; stack arg 3: output buffer
+        mov esi, DWORD PTR [esp+128] ; stack arg 4: input buffer
+        ; Calculate H
+        vpsrlq xmm1, xmm5, 63
+        vpsllq xmm0, xmm5, 1
+        vpslldq xmm1, xmm1, 8
+        vpor xmm0, xmm0,
xmm1
+        vpshufd xmm5, xmm5, 255
+        vpsrad xmm5, xmm5, 31
+        vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_mod2_128
+        vpxor xmm5, xmm5, xmm0
+        xor ebx, ebx ; ebx = number of bytes processed so far
+        cmp DWORD PTR [esp+132], 64 ; stack arg 5: byte count; fewer than 64 bytes skips the 4-block path
+        mov eax, DWORD PTR [esp+132]
+        jl L_AES_GCM_encrypt_update_avx2_done_64
+        and eax, 4294967232 ; 0xFFFFFFC0: byte count rounded down to a multiple of 64
+        vmovdqu OWORD PTR [esp+64], xmm4
+        vmovdqu OWORD PTR [esp+80], xmm6
+        vmovdqu xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128
+        ; H ^ 1
+        vmovdqu OWORD PTR [esp], xmm5
+        vmovdqu xmm2, xmm5
+        ; H ^ 2
+        vpclmulqdq xmm5, xmm2, xmm2, 0
+        vpclmulqdq xmm6, xmm2, xmm2, 17
+        vpclmulqdq xmm4, xmm5, xmm3, 16
+        vpshufd xmm5, xmm5, 78
+        vpxor xmm5, xmm5, xmm4
+        vpclmulqdq xmm4, xmm5, xmm3, 16
+        vpshufd xmm5, xmm5, 78
+        vpxor xmm5, xmm5, xmm4
+        vpxor xmm0, xmm6, xmm5
+        vmovdqu OWORD PTR [esp+16], xmm0
+        ; H ^ 3
+        ; ghash_gfmul_red
+        vpclmulqdq xmm6, xmm2, xmm0, 16
+        vpclmulqdq xmm5, xmm2, xmm0, 1
+        vpclmulqdq xmm4, xmm2, xmm0, 0
+        vpxor xmm6, xmm6, xmm5
+        vpslldq xmm5, xmm6, 8
+        vpsrldq xmm6, xmm6, 8
+        vpxor xmm5, xmm5, xmm4
+        vpclmulqdq xmm1, xmm2, xmm0, 17
+        vpclmulqdq xmm4, xmm5, xmm3, 16
+        vpshufd xmm5, xmm5, 78
+        vpxor xmm5, xmm5, xmm4
+        vpclmulqdq xmm4, xmm5, xmm3, 16
+        vpshufd xmm5, xmm5, 78
+        vpxor xmm1, xmm1, xmm6
+        vpxor xmm1, xmm1, xmm5
+        vpxor xmm1, xmm1, xmm4
+        vmovdqu OWORD PTR [esp+32], xmm1
+        ; H ^ 4
+        vpclmulqdq xmm5, xmm0, xmm0, 0
+        vpclmulqdq xmm6, xmm0, xmm0, 17
+        vpclmulqdq xmm4, xmm5, xmm3, 16
+        vpshufd xmm5, xmm5, 78
+        vpxor xmm5, xmm5, xmm4
+        vpclmulqdq xmm4, xmm5, xmm3, 16
+        vpshufd xmm5, xmm5, 78
+        vpxor xmm5, xmm5, xmm4
+        vpxor xmm2, xmm6, xmm5
+        vmovdqu OWORD PTR [esp+48], xmm2
+        vmovdqu xmm6, OWORD PTR [esp+80] ; reload the GHASH value; xmm6 was used as scratch above
+        ; First 64 bytes of input
+        ; aesenc_64
+        ; aesenc_ctr
+        vmovdqu xmm4, OWORD PTR [esp+64]
+        vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+        vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one
+        vpshufb xmm0, xmm4, xmm7
+        vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two
+        vpshufb xmm1, xmm1, xmm7
+        vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three
+        vpshufb xmm2, xmm2, xmm7
+        vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four
+        vpshufb xmm3, xmm3, xmm7
+        ; aesenc_xor
+        vmovdqu xmm7, OWORD PTR [ebp]
+        vmovdqu OWORD PTR [esp+64], xmm4 ; save the counter advanced past these four blocks
+        vpxor xmm0, xmm0, xmm7
+        vpxor xmm1, xmm1, xmm7
+        vpxor xmm2, xmm2, xmm7
+        vpxor xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+16]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+32]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+48]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+64]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+80]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+96]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+112]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+128]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+144]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        cmp DWORD PTR [esp+120], 11 ; stack arg 2: round count; below 11 the key at +160 is the final round key
+        vmovdqu xmm7, OWORD PTR [ebp+160]
+        jl L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+176]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        cmp DWORD PTR [esp+120], 13 ; below 13 the key at +192 is final, otherwise the one at +224
+        vmovdqu xmm7, OWORD PTR [ebp+192]
+        jl L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+208]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done:
+        ; aesenc_last
+        vaesenclast xmm0, xmm0, xmm7
+        vaesenclast xmm1, xmm1, xmm7
+        vaesenclast xmm2, xmm2, xmm7
+        vaesenclast xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [esi]
+        vmovdqu xmm4, OWORD PTR [esi+16]
+        vpxor xmm0, xmm0, xmm7
+        vpxor xmm1, xmm1, xmm4
+        vmovdqu OWORD PTR [edi], xmm0
+        vmovdqu OWORD PTR [edi+16], xmm1
+        vmovdqu xmm7, OWORD PTR [esi+32]
+        vmovdqu xmm4, OWORD PTR [esi+48]
+        vpxor xmm2, xmm2, xmm7
+        vpxor xmm3, xmm3, xmm4
+        vmovdqu OWORD PTR [edi+32], xmm2
+        vmovdqu OWORD PTR [edi+48], xmm3
+        cmp eax, 64
+        mov ebx, 64
+        mov ecx, esi
+        mov edx, edi ; edx = start of the 64 output bytes just written, hashed at end_64 if this was the only group
+        jle L_AES_GCM_encrypt_update_avx2_end_64
+        ; More 64 bytes of input
+L_AES_GCM_encrypt_update_avx2_ghash_64:
+        ; aesenc_64_ghash
+        lea ecx, DWORD PTR [esi+ebx] ; ecx = input for this group
+        lea edx, DWORD PTR [edi+ebx] ; edx = output for this group; [edx-64..edx-1] is the previous group's output
+        ; aesenc_64
+        ; aesenc_ctr
+        vmovdqu xmm4, OWORD PTR [esp+64]
+        vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+        vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one
+        vpshufb xmm0, xmm4, xmm7
+        vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two
+        vpshufb xmm1, xmm1, xmm7
+        vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three
+        vpshufb xmm2, xmm2, xmm7
+        vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four
+        vpshufb xmm3, xmm3, xmm7
+        ; aesenc_xor
+        vmovdqu xmm7, OWORD PTR [ebp]
+        vmovdqu OWORD PTR [esp+64], xmm4
+        vpxor xmm0, xmm0, xmm7
+        vpxor xmm1, xmm1, xmm7
+        vpxor xmm2, xmm2, xmm7
+        vpxor xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+16]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+32]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+48]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+64]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+80]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+96]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+112]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+128]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+144]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        cmp DWORD PTR [esp+120], 11
+        vmovdqu xmm7, OWORD PTR [ebp+160]
+        jl L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+176]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        cmp DWORD PTR [esp+120], 13
+        vmovdqu xmm7, OWORD PTR [ebp+192]
+        jl L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+208]
+        vaesenc xmm0, xmm0, xmm7
+        vaesenc xmm1, xmm1, xmm7
+        vaesenc xmm2, xmm2, xmm7
+        vaesenc xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done:
+        ; aesenc_last
+        vaesenclast xmm0, xmm0, xmm7
+        vaesenclast xmm1, xmm1, xmm7
+        vaesenclast xmm2, xmm2, xmm7
+        vaesenclast xmm3, xmm3, xmm7
+        vmovdqu xmm7, OWORD PTR [ecx]
+        vmovdqu xmm4, OWORD PTR [ecx+16]
+        vpxor xmm0, xmm0, xmm7
+        vpxor xmm1, xmm1, xmm4
+        vmovdqu OWORD PTR [edx], xmm0
+        vmovdqu OWORD PTR [edx+16], xmm1
+        vmovdqu xmm7, OWORD PTR [ecx+32]
+        vmovdqu xmm4, OWORD PTR [ecx+48]
+        vpxor xmm2, xmm2, xmm7
+        vpxor xmm3, xmm3, xmm4
+        vmovdqu OWORD PTR [edx+32], xmm2
+        vmovdqu OWORD PTR [edx+48], xmm3
+        ; pclmul_1
+        vmovdqu xmm1, OWORD PTR [edx+-64] ; first block of the previous group's output
+        vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vmovdqu xmm2, OWORD PTR [esp+48] ; multiplier stored under "H ^ 4"
+        vpxor xmm1, xmm1, xmm6 ; the running GHASH value is folded into the oldest block only
+        vpclmulqdq xmm5, xmm1, xmm2, 16
+        vpclmulqdq xmm3, xmm1, xmm2, 1
+        vpclmulqdq xmm6, xmm1, xmm2, 0
+        vpclmulqdq xmm7, xmm1, xmm2, 17
+        ; pclmul_2
+        vmovdqu xmm1, OWORD PTR [edx+-48]
+        vmovdqu xmm0, OWORD PTR [esp+32] ; multiplier stored under "H ^ 3"
+        vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpxor xmm5, xmm5, xmm3
+        vpclmulqdq xmm2, xmm1, xmm0, 16
+        vpclmulqdq xmm3, xmm1, xmm0, 1
+        vpclmulqdq xmm4, xmm1, xmm0, 0
+        vpclmulqdq xmm1, xmm1, xmm0, 17
+        vpxor xmm7, xmm7, xmm1
+        ; pclmul_n
+        vmovdqu xmm1, OWORD PTR [edx+-32]
+        vmovdqu xmm0, OWORD PTR [esp+16] ; multiplier stored under "H ^ 2"
+        vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpxor xmm5, xmm5, xmm2
+        vpclmulqdq xmm2, xmm1, xmm0, 16
+        vpxor xmm5, xmm5, xmm3
+        vpclmulqdq xmm3, xmm1, xmm0, 1
+        vpxor xmm6, xmm6, xmm4
+        vpclmulqdq xmm4, xmm1, xmm0, 0
+        vpclmulqdq xmm1, xmm1, xmm0, 17
+        vpxor xmm7, xmm7, xmm1
+        ; pclmul_n
+        vmovdqu xmm1, OWORD PTR [edx+-16]
+        vmovdqu xmm0, OWORD PTR [esp] ; multiplier stored under "H ^ 1"
+        vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpxor xmm5, xmm5, xmm2
+        vpclmulqdq xmm2, xmm1, xmm0, 16
+        vpxor xmm5, xmm5, xmm3
+        vpclmulqdq xmm3, xmm1, xmm0, 1
+        vpxor xmm6, xmm6, xmm4
+        vpclmulqdq xmm4, xmm1, xmm0, 0
+        vpclmulqdq xmm1, xmm1, xmm0, 17
+        vpxor xmm7, xmm7, xmm1
+        ; aesenc_pclmul_l
+        vpxor xmm5, xmm5, xmm2
+        vpxor xmm6, xmm6, xmm4
+        vpxor xmm5, xmm5, xmm3
+        vpslldq xmm1, xmm5, 8
+        vpsrldq xmm5, xmm5, 8
+        vmovdqu xmm0, OWORD PTR L_aes_gcm_avx2_mod2_128
+        vpxor xmm6, xmm6, xmm1
+        vpxor xmm7, xmm7, xmm5
+        vpclmulqdq xmm3, xmm6, xmm0, 16
+        vpshufd xmm6, xmm6, 78
+        vpxor xmm6, xmm6, xmm3
+        vpclmulqdq xmm3, xmm6, xmm0, 16
+        vpshufd xmm6, xmm6, 78
+        vpxor xmm6, xmm6, xmm3
+        vpxor xmm6, xmm6, xmm7 ; xmm6 = GHASH value covering everything up to the previous group
+        ; aesenc_64_ghash - end
+        add ebx, 64
+        cmp ebx, eax
+        jl L_AES_GCM_encrypt_update_avx2_ghash_64
+L_AES_GCM_encrypt_update_avx2_end_64:
+        vmovdqu OWORD PTR [esp+80], xmm6 ; park the GHASH value; xmm6 is reused as an accumulator below
+        vmovdqu xmm3, OWORD PTR [edx+48] ; hash the last 64 output bytes written, newest block first
+        vmovdqu xmm7, OWORD PTR [esp]
+        vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpclmulqdq xmm5, xmm7, xmm3, 16
+        vpclmulqdq xmm1, xmm7, xmm3, 1
+        vpclmulqdq xmm4, xmm7, xmm3, 0
+        vpclmulqdq xmm6, xmm7, xmm3, 17
+        vpxor xmm5, xmm5, xmm1
+        vmovdqu xmm3, OWORD PTR [edx+32]
+        vmovdqu xmm7, OWORD PTR [esp+16]
+        vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpclmulqdq xmm2, xmm7, xmm3, 16
+        vpclmulqdq xmm1, xmm7, xmm3, 1
+        vpclmulqdq xmm0, xmm7, xmm3, 0
+        vpclmulqdq xmm3, xmm7, xmm3, 17
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm6, xmm6, xmm3
+        vpxor xmm5, xmm5, xmm2
+        vpxor xmm4, xmm4, xmm0
+        vmovdqu xmm3, OWORD PTR [edx+16]
+        vmovdqu xmm7, OWORD PTR [esp+32]
+        vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpclmulqdq xmm2, xmm7, xmm3, 16
+        vpclmulqdq xmm1, xmm7, xmm3, 1
+        vpclmulqdq xmm0, xmm7, xmm3, 0
+        vpclmulqdq xmm3, xmm7, xmm3, 17
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm6, xmm6, xmm3
+        vpxor xmm5, xmm5, xmm2
+        vpxor xmm4, xmm4, xmm0
+        vmovdqu xmm0, OWORD PTR [esp+80]
+        vmovdqu xmm3, OWORD PTR [edx]
+        vmovdqu xmm7, OWORD PTR [esp+48]
+        vpshufb xmm3, xmm3, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpxor xmm3, xmm3, xmm0 ; the parked GHASH value joins the oldest of the four blocks
+        vpclmulqdq xmm2, xmm7, xmm3, 16
+        vpclmulqdq xmm1, xmm7, xmm3, 1
+        vpclmulqdq xmm0, xmm7, xmm3, 0
+        vpclmulqdq xmm3, xmm7, xmm3, 17
+        vpxor xmm2, xmm2, xmm1
+        vpxor xmm6, xmm6, xmm3
+        vpxor xmm5, xmm5, xmm2
+        vpxor xmm4, xmm4, xmm0
+        vpslldq xmm7, xmm5, 8
+        vpsrldq xmm5, xmm5, 8
+        vpxor xmm4, xmm4, xmm7
+        vpxor xmm6, xmm6, xmm5
+        ; ghash_red
+        vmovdqu xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128
+        vpclmulqdq xmm0, xmm4, xmm2, 16
+        vpshufd xmm1, xmm4, 78
+        vpxor xmm1, xmm1, xmm0
+        vpclmulqdq xmm0, xmm1, xmm2, 16
+        vpshufd xmm1, xmm1, 78
+        vpxor xmm1, xmm1, xmm0
+        vpxor xmm6, xmm6, xmm1
+        vmovdqu xmm5, OWORD PTR [esp] ; restore the multiplier saved under "H ^ 1" for the single-block path
+        vmovdqu xmm4, OWORD PTR [esp+64] ; restore the current counter
+L_AES_GCM_encrypt_update_avx2_done_64:
+        cmp ebx, DWORD PTR [esp+132]
+        je L_AES_GCM_encrypt_update_avx2_done_enc ; everything was consumed by the 64-byte path
+        mov eax, DWORD PTR [esp+132]
+        and eax, 4294967280 ; 0xFFFFFFF0: byte count rounded down to whole 16-byte blocks
+        cmp ebx, eax
+        jge L_AES_GCM_encrypt_update_avx2_last_block_done ; no whole block left; a trailing partial block is not processed by this procedure
+        lea ecx, DWORD PTR [esi+ebx]
+        lea edx, DWORD PTR [edi+ebx]
+        ; aesenc_block
+        vmovdqu xmm1, xmm4
+        vpshufb xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+        vpaddd xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_one
+        vpxor xmm0, xmm0, [ebp]
+        vaesenc xmm0, xmm0, [ebp+16]
+        vaesenc xmm0, xmm0, [ebp+32]
+        vaesenc xmm0, xmm0, [ebp+48]
+        vaesenc xmm0, xmm0, [ebp+64]
+        vaesenc xmm0, xmm0, [ebp+80]
+        vaesenc xmm0, xmm0, [ebp+96]
+        vaesenc xmm0, xmm0, [ebp+112]
+        vaesenc xmm0, xmm0, [ebp+128]
+        vaesenc xmm0, xmm0, [ebp+144]
+        cmp DWORD PTR [esp+120], 11
+        vmovdqu xmm2, OWORD PTR [ebp+160]
+        jl L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last
+        vaesenc xmm0, xmm0, xmm2
+        vaesenc xmm0, xmm0, [ebp+176]
+        cmp DWORD PTR [esp+120], 13
+        vmovdqu xmm2, OWORD PTR [ebp+192]
+        jl L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last
+        vaesenc xmm0, xmm0, xmm2
+        vaesenc xmm0, xmm0, [ebp+208]
+        vmovdqu xmm2, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last:
+        vaesenclast xmm0, xmm0, xmm2
+        vmovdqu xmm4, xmm1
+        vmovdqu xmm1, OWORD PTR [ecx]
+        vpxor xmm0, xmm0, xmm1
+        vmovdqu OWORD PTR [edx], xmm0
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpxor xmm6, xmm6, xmm0 ; output block XORed into the GHASH value; the multiply is deferred
+        add ebx, 16
+        cmp ebx, eax
+        jge L_AES_GCM_encrypt_update_avx2_last_block_ghash
+L_AES_GCM_encrypt_update_avx2_last_block_start:
+        vpshufb xmm7, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64
+        vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one
+        vmovdqu OWORD PTR [esp+64], xmm4 ; xmm4 is used as scratch below; the counter is reloaded after the block
+        ; aesenc_gfmul_sb
+        vpclmulqdq xmm2, xmm6, xmm5, 1 ; the pending GHASH multiply is interleaved with the AES rounds for this block
+        vpclmulqdq xmm3, xmm6, xmm5, 16
+        vpclmulqdq xmm1, xmm6, xmm5, 0
+        vpclmulqdq xmm4, xmm6, xmm5, 17
+        vpxor xmm7, xmm7, [ebp]
+        vaesenc xmm7, xmm7, [ebp+16]
+        vpxor xmm3, xmm3, xmm2
+        vpslldq xmm2, xmm3, 8
+        vpsrldq xmm3, xmm3, 8
+        vaesenc xmm7, xmm7, [ebp+32]
+        vpxor xmm2, xmm2, xmm1
+        vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+        vaesenc xmm7, xmm7, [ebp+48]
+        vaesenc xmm7, xmm7, [ebp+64]
+        vaesenc xmm7, xmm7, [ebp+80]
+        vpshufd xmm2, xmm2, 78
+        vpxor xmm2, xmm2, xmm1
+        vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+        vaesenc xmm7, xmm7, [ebp+96]
+        vaesenc xmm7, xmm7, [ebp+112]
+        vaesenc xmm7, xmm7, [ebp+128]
+        vpshufd xmm2, xmm2, 78
+        vaesenc xmm7, xmm7, [ebp+144]
+        vpxor xmm4, xmm4, xmm3
+        vpxor xmm2, xmm2, xmm4
+        vmovdqu xmm0, OWORD PTR [ebp+160]
+        cmp DWORD PTR [esp+120], 11
+        jl L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
+        vaesenc xmm7, xmm7, xmm0
+        vaesenc xmm7, xmm7, [ebp+176]
+        vmovdqu xmm0, OWORD PTR [ebp+192]
+        cmp DWORD PTR [esp+120], 13
+        jl L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
+        vaesenc xmm7, xmm7, xmm0
+        vaesenc xmm7, xmm7, [ebp+208]
+        vmovdqu xmm0, OWORD PTR [ebp+224]
+L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last:
+        vaesenclast xmm7, xmm7, xmm0
+        vmovdqu xmm3, OWORD PTR [esi+ebx]
+        vpxor xmm6, xmm2, xmm1 ; xmm6 = reduced GHASH product
+        vpxor xmm7, xmm7, xmm3
+        vmovdqu OWORD PTR [edi+ebx], xmm7
+        vpshufb xmm7, xmm7, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpxor xmm6, xmm6, xmm7 ; new output block XORed in; multiplied on the next pass or at last_block_ghash
+        vmovdqu xmm4, OWORD PTR [esp+64]
+        add ebx, 16
+        cmp ebx, eax
+        jl L_AES_GCM_encrypt_update_avx2_last_block_start
+L_AES_GCM_encrypt_update_avx2_last_block_ghash:
+        ; ghash_gfmul_red
+        vpclmulqdq xmm2, xmm6, xmm5, 16
+        vpclmulqdq xmm1, xmm6, xmm5, 1
+        vpclmulqdq xmm0, xmm6, xmm5, 0
+        vpxor xmm2, xmm2, xmm1
+        vpslldq xmm1, xmm2, 8
+        vpsrldq xmm2, xmm2, 8
+        vpxor xmm1, xmm1, xmm0
+        vpclmulqdq xmm6, xmm6, xmm5, 17
+        vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128,
16
+        vpshufd xmm1, xmm1, 78
+        vpxor xmm1, xmm1, xmm0
+        vpclmulqdq xmm0, xmm1, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+        vpshufd xmm1, xmm1, 78
+        vpxor xmm6, xmm6, xmm2
+        vpxor xmm6, xmm6, xmm1
+        vpxor xmm6, xmm6, xmm0
+L_AES_GCM_encrypt_update_avx2_last_block_done:
+L_AES_GCM_encrypt_update_avx2_done_enc:
+        mov esi, DWORD PTR [esp+136] ; stack arg 6: GHASH value destination
+        mov edi, DWORD PTR [esp+144] ; stack arg 8: counter destination
+        vmovdqu OWORD PTR [esi], xmm6 ; write the updated GHASH value back
+        vmovdqu OWORD PTR [edi], xmm4 ; write the advanced counter back
+        add esp, 96
+        pop ebp
+        pop edi
+        pop esi
+        pop ebx
+        ret
+AES_GCM_encrypt_update_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_encrypt_final_avx2 PROC ; finishes GHASH with the bit-length block, XORs in the block from stack arg 7 and stores the tag; arguments are read from the stack
+        push esi
+        push edi
+        push ebp
+        sub esp, 16 ; 16-byte buffer used when the tag is copied byte by byte
+        mov ebp, DWORD PTR [esp+32] ; stack arg 1: 16-byte GHASH value
+        mov esi, DWORD PTR [esp+52] ; stack arg 6: hash key H
+        mov edi, DWORD PTR [esp+56] ; stack arg 7: 16-byte block XORed into the result; presumably the encrypted initial counter, verify against caller
+        vmovdqu xmm4, OWORD PTR [ebp]
+        vmovdqu xmm5, OWORD PTR [esi]
+        vmovdqu xmm6, OWORD PTR [edi]
+        vpsrlq xmm1, xmm5, 63 ; same instruction sequence as the "Calculate H" step of AES_GCM_encrypt_update_avx2
+        vpsllq xmm0, xmm5, 1
+        vpslldq xmm1, xmm1, 8
+        vpor xmm0, xmm0, xmm1
+        vpshufd xmm5, xmm5, 255
+        vpsrad xmm5, xmm5, 31
+        vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_mod2_128
+        vpxor xmm5, xmm5, xmm0
+        ; calc_tag
+        mov ecx, DWORD PTR [esp+44] ; stack arg 4: a byte length; presumably the message length, verify against caller
+        shl ecx, 3 ; low 32 bits of the length in bits
+        vpinsrd xmm0, xmm0, ecx, 0
+        mov ecx, DWORD PTR [esp+48] ; stack arg 5: a byte length; presumably the AAD length, verify against caller
+        shl ecx, 3
+        vpinsrd xmm0, xmm0, ecx, 2
+        mov ecx, DWORD PTR [esp+44]
+        shr ecx, 29 ; the three bits shifted out by shl 3 form the high dword of the bit length
+        vpinsrd xmm0, xmm0, ecx, 1
+        mov ecx, DWORD PTR [esp+48]
+        shr ecx, 29
+        vpinsrd xmm0, xmm0, ecx, 3 ; all four dwords of xmm0 are now set: two 64-bit bit lengths
+        vpxor xmm0, xmm0, xmm4
+        ; ghash_gfmul_red
+        vpclmulqdq xmm7, xmm0, xmm5, 16
+        vpclmulqdq xmm3, xmm0, xmm5, 1
+        vpclmulqdq xmm2, xmm0, xmm5, 0
+        vpxor xmm7, xmm7, xmm3
+        vpslldq xmm3, xmm7, 8
+        vpsrldq xmm7, xmm7, 8
+        vpxor xmm3, xmm3, xmm2
+        vpclmulqdq xmm0, xmm0, xmm5, 17
+        vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+        vpshufd xmm3, xmm3, 78
+        vpxor xmm3, xmm3, xmm2
+        vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16
+        vpshufd xmm3, xmm3, 78
+        vpxor xmm0, xmm0, xmm7
+        vpxor xmm0, xmm0, xmm3
+        vpxor xmm0, xmm0, xmm2
+        vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask
+        vpxor xmm0, xmm0, xmm6 ; xmm0 = full 16-byte tag
+        mov edi, DWORD PTR [esp+36] ; stack arg 2: tag output buffer
+        ; store_tag
+        cmp DWORD PTR [esp+40], 16 ; stack arg 3: tag length in bytes; exactly 16 uses the single store below
+        je L_AES_GCM_encrypt_final_avx2_store_tag_16
+        xor ecx, ecx
+        vmovdqu OWORD PTR [esp], xmm0 ; spill the tag so it can be copied one byte at a time
+L_AES_GCM_encrypt_final_avx2_store_tag_loop:
+        movzx eax, BYTE PTR [esp+ecx]
+        mov BYTE PTR [edi+ecx], al
+        inc ecx
+        cmp ecx, DWORD PTR [esp+40]
+        jne L_AES_GCM_encrypt_final_avx2_store_tag_loop ; exits only on equality after the increment. NOTE(review): a length of 0 or above 16 is not guarded here; confirm callers validate it
+        jmp L_AES_GCM_encrypt_final_avx2_store_tag_done
+L_AES_GCM_encrypt_final_avx2_store_tag_16:
+        vmovdqu OWORD PTR [edi], xmm0
+L_AES_GCM_encrypt_final_avx2_store_tag_done:
+        add esp, 16
+        pop ebp
+        pop edi
+        pop esi
+        ret
+AES_GCM_encrypt_final_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+AES_GCM_decrypt_update_avx2 PROC
+        push ebx
+        push esi
+        push edi
+        push ebp
+        sub esp, 160
+        mov esi, DWORD PTR [esp+208]
+        vmovdqu xmm4, OWORD PTR [esi]
+        mov esi, DWORD PTR [esp+200]
+        mov ebp, DWORD PTR [esp+204]
+        vmovdqu xmm6, OWORD PTR [esi]
+        vmovdqu xmm5, OWORD PTR [ebp]
+        mov ebp, DWORD PTR [esp+180]
+        mov edi, DWORD PTR [esp+188]
+        mov esi, DWORD PTR [esp+192]
+        ; Calculate H
+        vpsrlq xmm1, xmm5, 63
+        vpsllq xmm0, xmm5, 1
+        vpslldq xmm1, xmm1, 8
+        vpor xmm0, xmm0, xmm1
+        vpshufd xmm5, xmm5, 255
+        vpsrad xmm5, xmm5, 31
+        vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_mod2_128
+        vpxor xmm5, xmm5, xmm0
+        xor ebx, ebx
+        cmp DWORD PTR [esp+196], 64
+        mov eax, DWORD PTR [esp+196]
+        jl L_AES_GCM_decrypt_update_avx2_done_64
+        and eax, 4294967232
+        vmovdqu OWORD PTR [esp+64], xmm4
+        vmovdqu OWORD PTR [esp+80], xmm6
+        vmovdqu xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128
+        ; H ^ 1
+        vmovdqu OWORD PTR [esp], xmm5
+        vmovdqu xmm2, xmm5
+        ; H ^ 2
+        vpclmulqdq xmm5, xmm2, xmm2, 0
+        vpclmulqdq xmm6, xmm2, xmm2, 17
+        vpclmulqdq xmm4, xmm5, xmm3, 16
+        vpshufd xmm5, xmm5, 78
+        vpxor xmm5, xmm5, xmm4
+        vpclmulqdq xmm4, xmm5, xmm3, 16
+        vpshufd xmm5, xmm5, 78
+        vpxor xmm5, xmm5, xmm4
+        vpxor xmm0, xmm6, xmm5
+        vmovdqu OWORD PTR [esp+16], xmm0
+        ; H ^ 3
+        ; ghash_gfmul_red
+        vpclmulqdq xmm6, xmm2, xmm0, 16
+        vpclmulqdq xmm5, xmm2, xmm0, 1
+ vpclmulqdq xmm4, xmm2, xmm0, 0 + vpxor xmm6, xmm6, xmm5 + vpslldq xmm5, xmm6, 8 + vpsrldq xmm6, xmm6, 8 + vpxor xmm5, xmm5, xmm4 + vpclmulqdq xmm1, xmm2, xmm0, 17 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm5, xmm5, xmm4 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm1, xmm1, xmm6 + vpxor xmm1, xmm1, xmm5 + vpxor xmm1, xmm1, xmm4 + vmovdqu OWORD PTR [esp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm5, xmm0, xmm0, 0 + vpclmulqdq xmm6, xmm0, xmm0, 17 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm5, xmm5, xmm4 + vpclmulqdq xmm4, xmm5, xmm3, 16 + vpshufd xmm5, xmm5, 78 + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm6, xmm5 + vmovdqu OWORD PTR [esp+48], xmm2 + vmovdqu xmm6, OWORD PTR [esp+80] + cmp edi, esi + jne L_AES_GCM_decrypt_update_avx2_ghash_64 +L_AES_GCM_decrypt_update_avx2_ghash_64_inplace: + ; aesenc_64_ghash + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + ; aesenc_64 + ; aesenc_ctr + vmovdqu xmm4, OWORD PTR [esp+64] + vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one + vpshufb xmm0, xmm4, xmm7 + vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two + vpshufb xmm1, xmm1, xmm7 + vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three + vpshufb xmm2, xmm2, xmm7 + vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four + vpshufb xmm3, xmm3, xmm7 + ; aesenc_xor + vmovdqu xmm7, OWORD PTR [ebp] + vmovdqu OWORD PTR [esp+64], xmm4 + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+16] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+32] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+48] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu 
xmm7, OWORD PTR [ebp+64] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+80] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+96] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+112] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+128] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+144] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+184], 11 + vmovdqu xmm7, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+176] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+184], 13 + vmovdqu xmm7, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+208] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done: + ; aesenc_last + vaesenclast xmm0, xmm0, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vaesenclast xmm2, xmm2, xmm7 + vaesenclast xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ecx] + vmovdqu xmm4, OWORD PTR [ecx+16] + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, 
xmm1, xmm4 + vmovdqu OWORD PTR [esp+96], xmm7 + vmovdqu OWORD PTR [esp+112], xmm4 + vmovdqu OWORD PTR [edx], xmm0 + vmovdqu OWORD PTR [edx+16], xmm1 + vmovdqu xmm7, OWORD PTR [ecx+32] + vmovdqu xmm4, OWORD PTR [ecx+48] + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm4 + vmovdqu OWORD PTR [esp+128], xmm7 + vmovdqu OWORD PTR [esp+144], xmm4 + vmovdqu OWORD PTR [edx+32], xmm2 + vmovdqu OWORD PTR [edx+48], xmm3 + ; pclmul_1 + vmovdqu xmm1, OWORD PTR [esp+96] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vmovdqu xmm2, OWORD PTR [esp+48] + vpxor xmm1, xmm1, xmm6 + vpclmulqdq xmm5, xmm1, xmm2, 16 + vpclmulqdq xmm3, xmm1, xmm2, 1 + vpclmulqdq xmm6, xmm1, xmm2, 0 + vpclmulqdq xmm7, xmm1, xmm2, 17 + ; pclmul_2 + vmovdqu xmm1, OWORD PTR [esp+112] + vmovdqu xmm0, OWORD PTR [esp+32] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; pclmul_n + vmovdqu xmm1, OWORD PTR [esp+128] + vmovdqu xmm0, OWORD PTR [esp+16] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; pclmul_n + vmovdqu xmm1, OWORD PTR [esp+144] + vmovdqu xmm0, OWORD PTR [esp] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; aesenc_pclmul_l + vpxor xmm5, xmm5, xmm2 + vpxor xmm6, xmm6, xmm4 + vpxor xmm5, xmm5, xmm3 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vmovdqu xmm0, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpxor xmm6, xmm6, 
xmm1 + vpxor xmm7, xmm7, xmm5 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpxor xmm6, xmm6, xmm7 + ; aesenc_64_ghash - end + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_decrypt_update_avx2_ghash_64_inplace + jmp L_AES_GCM_decrypt_update_avx2_ghash_64_done +L_AES_GCM_decrypt_update_avx2_ghash_64: + ; aesenc_64_ghash + lea ecx, DWORD PTR [esi+ebx] + lea edx, DWORD PTR [edi+ebx] + ; aesenc_64 + ; aesenc_ctr + vmovdqu xmm4, OWORD PTR [esp+64] + vmovdqu xmm7, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpaddd xmm1, xmm4, OWORD PTR L_aes_gcm_avx2_one + vpshufb xmm0, xmm4, xmm7 + vpaddd xmm2, xmm4, OWORD PTR L_aes_gcm_avx2_two + vpshufb xmm1, xmm1, xmm7 + vpaddd xmm3, xmm4, OWORD PTR L_aes_gcm_avx2_three + vpshufb xmm2, xmm2, xmm7 + vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_four + vpshufb xmm3, xmm3, xmm7 + ; aesenc_xor + vmovdqu xmm7, OWORD PTR [ebp] + vmovdqu OWORD PTR [esp+64], xmm4 + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+16] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+32] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+48] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+64] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+80] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+96] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR 
[ebp+112] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+128] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+144] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+184], 11 + vmovdqu xmm7, OWORD PTR [ebp+160] + jl L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+176] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + cmp DWORD PTR [esp+184], 13 + vmovdqu xmm7, OWORD PTR [ebp+192] + jl L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+208] + vaesenc xmm0, xmm0, xmm7 + vaesenc xmm1, xmm1, xmm7 + vaesenc xmm2, xmm2, xmm7 + vaesenc xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done: + ; aesenc_last + vaesenclast xmm0, xmm0, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vaesenclast xmm2, xmm2, xmm7 + vaesenclast xmm3, xmm3, xmm7 + vmovdqu xmm7, OWORD PTR [ecx] + vmovdqu xmm4, OWORD PTR [ecx+16] + vpxor xmm0, xmm0, xmm7 + vpxor xmm1, xmm1, xmm4 + vmovdqu OWORD PTR [edx], xmm0 + vmovdqu OWORD PTR [edx+16], xmm1 + vmovdqu xmm7, OWORD PTR [ecx+32] + vmovdqu xmm4, OWORD PTR [ecx+48] + vpxor xmm2, xmm2, xmm7 + vpxor xmm3, xmm3, xmm4 + vmovdqu OWORD PTR [ecx+32], xmm7 + vmovdqu OWORD PTR [ecx+48], xmm4 + vmovdqu OWORD PTR [edx+32], xmm2 + vmovdqu OWORD PTR [edx+48], xmm3 + ; pclmul_1 + vmovdqu xmm1, OWORD PTR [ecx] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vmovdqu xmm2, OWORD PTR 
[esp+48] + vpxor xmm1, xmm1, xmm6 + vpclmulqdq xmm5, xmm1, xmm2, 16 + vpclmulqdq xmm3, xmm1, xmm2, 1 + vpclmulqdq xmm6, xmm1, xmm2, 0 + vpclmulqdq xmm7, xmm1, xmm2, 17 + ; pclmul_2 + vmovdqu xmm1, OWORD PTR [ecx+16] + vmovdqu xmm0, OWORD PTR [esp+32] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; pclmul_n + vmovdqu xmm1, OWORD PTR [ecx+32] + vmovdqu xmm0, OWORD PTR [esp+16] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; pclmul_n + vmovdqu xmm1, OWORD PTR [ecx+48] + vmovdqu xmm0, OWORD PTR [esp] + vpshufb xmm1, xmm1, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vpxor xmm7, xmm7, xmm1 + ; aesenc_pclmul_l + vpxor xmm5, xmm5, xmm2 + vpxor xmm6, xmm6, xmm4 + vpxor xmm5, xmm5, xmm3 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vmovdqu xmm0, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpxor xmm6, xmm6, xmm1 + vpxor xmm7, xmm7, xmm5 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpxor xmm6, xmm6, xmm7 + ; aesenc_64_ghash - end + add ebx, 64 + cmp ebx, eax + jl L_AES_GCM_decrypt_update_avx2_ghash_64 +L_AES_GCM_decrypt_update_avx2_ghash_64_done: + vmovdqu xmm5, OWORD PTR [esp] + vmovdqu xmm4, OWORD PTR [esp+64] +L_AES_GCM_decrypt_update_avx2_done_64: + cmp ebx, DWORD PTR [esp+196] + jge 
L_AES_GCM_decrypt_update_avx2_done_dec + mov eax, DWORD PTR [esp+196] + and eax, 4294967280 + cmp ebx, eax + jge L_AES_GCM_decrypt_update_avx2_last_block_done +L_AES_GCM_decrypt_update_avx2_last_block_start: + vmovdqu xmm0, OWORD PTR [esi+ebx] + vpshufb xmm7, xmm4, OWORD PTR L_aes_gcm_avx2_bswap_epi64 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpaddd xmm4, xmm4, OWORD PTR L_aes_gcm_avx2_one + vmovdqu OWORD PTR [esp+64], xmm4 + vpxor xmm4, xmm0, xmm6 + ; aesenc_gfmul_sb + vpclmulqdq xmm2, xmm4, xmm5, 1 + vpclmulqdq xmm3, xmm4, xmm5, 16 + vpclmulqdq xmm1, xmm4, xmm5, 0 + vpclmulqdq xmm4, xmm4, xmm5, 17 + vpxor xmm7, xmm7, [ebp] + vaesenc xmm7, xmm7, [ebp+16] + vpxor xmm3, xmm3, xmm2 + vpslldq xmm2, xmm3, 8 + vpsrldq xmm3, xmm3, 8 + vaesenc xmm7, xmm7, [ebp+32] + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vaesenc xmm7, xmm7, [ebp+48] + vaesenc xmm7, xmm7, [ebp+64] + vaesenc xmm7, xmm7, [ebp+80] + vpshufd xmm2, xmm2, 78 + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vaesenc xmm7, xmm7, [ebp+96] + vaesenc xmm7, xmm7, [ebp+112] + vaesenc xmm7, xmm7, [ebp+128] + vpshufd xmm2, xmm2, 78 + vaesenc xmm7, xmm7, [ebp+144] + vpxor xmm4, xmm4, xmm3 + vpxor xmm2, xmm2, xmm4 + vmovdqu xmm0, OWORD PTR [ebp+160] + cmp DWORD PTR [esp+184], 11 + jl L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last + vaesenc xmm7, xmm7, xmm0 + vaesenc xmm7, xmm7, [ebp+176] + vmovdqu xmm0, OWORD PTR [ebp+192] + cmp DWORD PTR [esp+184], 13 + jl L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last + vaesenc xmm7, xmm7, xmm0 + vaesenc xmm7, xmm7, [ebp+208] + vmovdqu xmm0, OWORD PTR [ebp+224] +L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last: + vaesenclast xmm7, xmm7, xmm0 + vmovdqu xmm3, OWORD PTR [esi+ebx] + vpxor xmm6, xmm2, xmm1 + vpxor xmm7, xmm7, xmm3 + vmovdqu OWORD PTR [edi+ebx], xmm7 + vmovdqu xmm4, OWORD PTR [esp+64] + add ebx, 16 + cmp ebx, eax + jl 
L_AES_GCM_decrypt_update_avx2_last_block_start +L_AES_GCM_decrypt_update_avx2_last_block_done: +L_AES_GCM_decrypt_update_avx2_done_dec: + mov esi, DWORD PTR [esp+200] + mov edi, DWORD PTR [esp+208] + vmovdqu xmm4, OWORD PTR [esp+64] + vmovdqu OWORD PTR [esi], xmm6 + vmovdqu OWORD PTR [edi], xmm4 + add esp, 160 + pop ebp + pop edi + pop esi + pop ebx + ret +AES_GCM_decrypt_update_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +AES_GCM_decrypt_final_avx2 PROC + push ebx + push esi + push edi + push ebp + sub esp, 16 + mov ebp, DWORD PTR [esp+36] + mov esi, DWORD PTR [esp+56] + mov edi, DWORD PTR [esp+60] + vmovdqu xmm4, OWORD PTR [ebp] + vmovdqu xmm5, OWORD PTR [esi] + vmovdqu xmm6, OWORD PTR [edi] + vpsrlq xmm1, xmm5, 63 + vpsllq xmm0, xmm5, 1 + vpslldq xmm1, xmm1, 8 + vpor xmm0, xmm0, xmm1 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpand xmm5, xmm5, OWORD PTR L_aes_gcm_avx2_mod2_128 + vpxor xmm5, xmm5, xmm0 + ; calc_tag + mov ecx, DWORD PTR [esp+48] + shl ecx, 3 + vpinsrd xmm0, xmm0, ecx, 0 + mov ecx, DWORD PTR [esp+52] + shl ecx, 3 + vpinsrd xmm0, xmm0, ecx, 2 + mov ecx, DWORD PTR [esp+48] + shr ecx, 29 + vpinsrd xmm0, xmm0, ecx, 1 + mov ecx, DWORD PTR [esp+52] + shr ecx, 29 + vpinsrd xmm0, xmm0, ecx, 3 + vpxor xmm0, xmm0, xmm4 + ; ghash_gfmul_red + vpclmulqdq xmm7, xmm0, xmm5, 16 + vpclmulqdq xmm3, xmm0, xmm5, 1 + vpclmulqdq xmm2, xmm0, xmm5, 0 + vpxor xmm7, xmm7, xmm3 + vpslldq xmm3, xmm7, 8 + vpsrldq xmm7, xmm7, 8 + vpxor xmm3, xmm3, xmm2 + vpclmulqdq xmm0, xmm0, xmm5, 17 + vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vpshufd xmm3, xmm3, 78 + vpxor xmm3, xmm3, xmm2 + vpclmulqdq xmm2, xmm3, OWORD PTR L_aes_gcm_avx2_mod2_128, 16 + vpshufd xmm3, xmm3, 78 + vpxor xmm0, xmm0, xmm7 + vpxor xmm0, xmm0, xmm3 + vpxor xmm0, xmm0, xmm2 + vpshufb xmm0, xmm0, OWORD PTR L_aes_gcm_avx2_bswap_mask + vpxor xmm0, xmm0, xmm6 + mov esi, DWORD PTR [esp+40] + mov edi, DWORD PTR [esp+64] + ; cmp_tag + cmp DWORD PTR [esp+44], 16 + je 
L_AES_GCM_decrypt_final_avx2_cmp_tag_16 + xor ecx, ecx + xor edx, edx + vmovdqu OWORD PTR [esp], xmm0 +L_AES_GCM_decrypt_final_avx2_cmp_tag_loop: + movzx eax, BYTE PTR [esp+ecx] + xor al, BYTE PTR [esi+ecx] + or dl, al + inc ecx + cmp ecx, DWORD PTR [esp+44] + jne L_AES_GCM_decrypt_final_avx2_cmp_tag_loop + cmp dl, 0 + sete dl + jmp L_AES_GCM_decrypt_final_avx2_cmp_tag_done +L_AES_GCM_decrypt_final_avx2_cmp_tag_16: + vmovdqu xmm1, OWORD PTR [esi] + vpcmpeqb xmm0, xmm0, xmm1 + vpmovmskb ecx, xmm0 + ; %%edx == 0xFFFF then return 1 else => return 0 + xor edx, edx + cmp ecx, 65535 + sete dl +L_AES_GCM_decrypt_final_avx2_cmp_tag_done: + mov DWORD PTR [edi], edx + add esp, 16 + pop ebp + pop edi + pop esi + pop ebx + ret +AES_GCM_decrypt_final_avx2 ENDP +_TEXT ENDS +ENDIF +ENDIF +END diff --git a/wolfcrypt/src/aes_x86_64_asm.S b/wolfcrypt/src/aes_x86_64_asm.S index 9eb85b49c73..e2c9c318e04 100644 --- a/wolfcrypt/src/aes_x86_64_asm.S +++ b/wolfcrypt/src/aes_x86_64_asm.S @@ -1141,7 +1141,7 @@ AES_CTR_encrypt_AESNI: .p2align 4 _AES_CTR_encrypt_AESNI: #endif /* __APPLE__ */ - pushq %rbx + pushq %r12 movdqu L_aes_ctr_aesni_bswap(%rip), %xmm8 movdqu L_aes_ctr_aesni_one(%rip), %xmm9 pxor %xmm10, %xmm10 @@ -1156,7 +1156,7 @@ L_AES_CTR_encrypt_AESNI_enc_64: # 64 bytes of input # aes_ctr_enc_64 leaq (%rdi,%rax,1), %r11 - leaq (%rsi,%rax,1), %rbx + leaq (%rsi,%rax,1), %r12 movdqa %xmm7, %xmm0 pshufb %xmm8, %xmm0 paddq %xmm9, %xmm7 @@ -1278,10 +1278,10 @@ L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last: pxor %xmm4, %xmm2 movdqu 48(%r11), %xmm4 pxor %xmm4, %xmm3 - movdqu %xmm0, (%rbx) - movdqu %xmm1, 16(%rbx) - movdqu %xmm2, 32(%rbx) - movdqu %xmm3, 48(%rbx) + movdqu %xmm0, (%r12) + movdqu %xmm1, 16(%r12) + movdqu %xmm2, 32(%r12) + movdqu %xmm3, 48(%r12) addl $0x40, %eax cmpl %r10d, %eax jl L_AES_CTR_encrypt_AESNI_enc_64 @@ -1346,7 +1346,7 @@ L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last: L_AES_CTR_encrypt_AESNI_done_enc: pshufb %xmm8, %xmm7 movdqu %xmm7, (%r9) - popq %rbx + popq %r12 
repz retq #ifndef __APPLE__ .size AES_CTR_encrypt_AESNI,.-AES_CTR_encrypt_AESNI @@ -1972,7 +1972,7 @@ AES_CTR_encrypt_avx1: .p2align 4 _AES_CTR_encrypt_avx1: #endif /* __APPLE__ */ - pushq %rbx + pushq %r12 vmovdqu L_aes_ctr_avx1_bswap(%rip), %xmm8 vmovdqu L_aes_ctr_avx1_one(%rip), %xmm9 vpxor %xmm10, %xmm10, %xmm10 @@ -1987,7 +1987,7 @@ L_AES_CTR_encrypt_avx1_enc_64: # 64 bytes of input # aes_ctr_enc_64 leaq (%rdi,%rax,1), %r11 - leaq (%rsi,%rax,1), %rbx + leaq (%rsi,%rax,1), %r12 vpshufb %xmm8, %xmm7, %xmm0 vpaddq %xmm9, %xmm7, %xmm7 vpcmpeqq %xmm10, %xmm7, %xmm11 @@ -2097,10 +2097,10 @@ L_AES_CTR_encrypt_avx1_64_aes_enc_block_last: vpxor 16(%r11), %xmm1, %xmm1 vpxor 32(%r11), %xmm2, %xmm2 vpxor 48(%r11), %xmm3, %xmm3 - vmovdqu %xmm0, (%rbx) - vmovdqu %xmm1, 16(%rbx) - vmovdqu %xmm2, 32(%rbx) - vmovdqu %xmm3, 48(%rbx) + vmovdqu %xmm0, (%r12) + vmovdqu %xmm1, 16(%r12) + vmovdqu %xmm2, 32(%r12) + vmovdqu %xmm3, 48(%r12) addl $0x40, %eax cmpl %r10d, %eax jl L_AES_CTR_encrypt_avx1_enc_64 @@ -2162,7 +2162,7 @@ L_AES_CTR_encrypt_avx1_16_aes_enc_block_last: L_AES_CTR_encrypt_avx1_done_enc: vpshufb %xmm8, %xmm7, %xmm7 vmovdqu %xmm7, (%r9) - popq %rbx + popq %r12 repz retq #ifndef __APPLE__ .size AES_CTR_encrypt_avx1,.-AES_CTR_encrypt_avx1 @@ -2965,7 +2965,7 @@ AES_CTR_encrypt_vaes: .p2align 4 _AES_CTR_encrypt_vaes: #endif /* __APPLE__ */ - pushq %rbx + pushq %r12 vbroadcasti128 L_aes_ctr_bswap_vaes(%rip), %ymm8 vbroadcasti128 (%r9), %ymm7 vpshufb %ymm8, %ymm7, %ymm7 @@ -3016,7 +3016,7 @@ _AES_CTR_encrypt_vaes: L_AES_CTR_encrypt_vaes_enc_128: # 128 bytes of input leaq (%rdi,%rax,1), %r11 - leaq (%rsi,%rax,1), %rbx + leaq (%rsi,%rax,1), %r12 vpshufb %ymm8, %ymm4, %ymm0 vpshufb %ymm8, %ymm5, %ymm1 vpshufb %ymm8, %ymm6, %ymm2 @@ -3142,10 +3142,10 @@ L_AES_CTR_encrypt_vaes_128_aes_enc_block_last: vpxor 32(%r11), %ymm1, %ymm1 vpxor 64(%r11), %ymm2, %ymm2 vpxor 96(%r11), %ymm3, %ymm3 - vmovdqu %ymm0, (%rbx) - vmovdqu %ymm1, 32(%rbx) - vmovdqu %ymm2, 64(%rbx) - vmovdqu %ymm3, 
96(%rbx) + vmovdqu %ymm0, (%r12) + vmovdqu %ymm1, 32(%r12) + vmovdqu %ymm2, 64(%r12) + vmovdqu %ymm3, 96(%r12) addl $0x80, %eax cmpl %r10d, %eax jl L_AES_CTR_encrypt_vaes_enc_128 @@ -3159,7 +3159,7 @@ L_AES_CTR_encrypt_vaes_enc_32: # 32 bytes of input # aes_ctr_enc_32 leaq (%rdi,%rax,1), %r11 - leaq (%rsi,%rax,1), %rbx + leaq (%rsi,%rax,1), %r12 vpaddq 0+L_aes_ctr_inc_vaes(%rip), %ymm7, %ymm0 vmovdqa %ymm7, %ymm9 vpand 0+L_aes_ctr_inc_vaes(%rip), %ymm9, %ymm14 @@ -3216,7 +3216,7 @@ L_AES_CTR_encrypt_vaes_enc_32: L_AES_CTR_encrypt_vaes_32_aes_enc_block_last: vaesenclast %ymm13, %ymm0, %ymm0 vpxor (%r11), %ymm0, %ymm0 - vmovdqu %ymm0, (%rbx) + vmovdqu %ymm0, (%r12) addl $32, %eax cmpl %r10d, %eax jl L_AES_CTR_encrypt_vaes_enc_32 @@ -3282,7 +3282,7 @@ L_AES_CTR_encrypt_vaes_16_aes_enc_block_last: L_AES_CTR_encrypt_vaes_done_enc: vpshufb %xmm8, %xmm7, %xmm0 vmovdqu %xmm0, (%r9) - popq %rbx + popq %r12 repz retq #ifndef __APPLE__ .size AES_CTR_encrypt_vaes,.-AES_CTR_encrypt_vaes @@ -4080,7 +4080,7 @@ AES_CTR_encrypt_avx512: .p2align 4 _AES_CTR_encrypt_avx512: #endif /* __APPLE__ */ - pushq %rbx + pushq %r12 vbroadcasti32x4 L_aes_ctr_bswap_avx512(%rip), %zmm8 vbroadcasti32x4 (%r9), %zmm7 vpshufb %zmm8, %zmm7, %zmm7 @@ -4141,7 +4141,7 @@ L_AES_CTR_encrypt_avx512_key_cached: L_AES_CTR_encrypt_avx512_enc_256: # 256 bytes of input leaq (%rdi,%rax,1), %r11 - leaq (%rsi,%rax,1), %rbx + leaq (%rsi,%rax,1), %r12 vpshufb %zmm8, %zmm4, %zmm0 vpshufb %zmm8, %zmm5, %zmm1 vpshufb %zmm8, %zmm6, %zmm2 @@ -4243,10 +4243,10 @@ L_AES_CTR_encrypt_avx512_256_aes_enc_block_last: vpxorq 64(%r11), %zmm1, %zmm1 vpxorq 128(%r11), %zmm2, %zmm2 vpxorq 192(%r11), %zmm3, %zmm3 - vmovdqu64 %zmm0, (%rbx) - vmovdqu64 %zmm1, 64(%rbx) - vmovdqu64 %zmm2, 128(%rbx) - vmovdqu64 %zmm3, 192(%rbx) + vmovdqu64 %zmm0, (%r12) + vmovdqu64 %zmm1, 64(%r12) + vmovdqu64 %zmm2, 128(%r12) + vmovdqu64 %zmm3, 192(%r12) addl $0x100, %eax cmpl %r10d, %eax jl L_AES_CTR_encrypt_avx512_enc_256 @@ -4260,7 +4260,7 @@ 
L_AES_CTR_encrypt_avx512_enc_64: # 64 bytes of input # aes_ctr_enc_64 leaq (%rdi,%rax,1), %r11 - leaq (%rsi,%rax,1), %rbx + leaq (%rsi,%rax,1), %r12 vpaddq 0+L_aes_ctr_inc_avx512(%rip), %zmm7, %zmm0 vmovdqa64 %zmm7, %zmm9 vpternlogq $0xb2, 0+L_aes_ctr_inc_avx512(%rip), %zmm0, %zmm9 @@ -4299,7 +4299,7 @@ L_AES_CTR_encrypt_avx512_enc_64: L_AES_CTR_encrypt_avx512_64_aes_enc_block_last: vaesenclast %zmm13, %zmm0, %zmm0 vpxorq (%r11), %zmm0, %zmm0 - vmovdqu64 %zmm0, (%rbx) + vmovdqu64 %zmm0, (%r12) addl $0x40, %eax cmpl %r10d, %eax jl L_AES_CTR_encrypt_avx512_enc_64 @@ -4362,7 +4362,7 @@ L_AES_CTR_encrypt_avx512_16_aes_enc_block_last: L_AES_CTR_encrypt_avx512_done_enc: vpshufb %xmm8, %xmm7, %xmm0 vmovdqu %xmm0, (%r9) - popq %rbx + popq %r12 repz retq #ifndef __APPLE__ .size AES_CTR_encrypt_avx512,.-AES_CTR_encrypt_avx512 diff --git a/wolfcrypt/src/aes_x86_64_asm.asm b/wolfcrypt/src/aes_x86_64_asm.asm index 26ccbb5ee8e..aacbd440da3 100644 --- a/wolfcrypt/src/aes_x86_64_asm.asm +++ b/wolfcrypt/src/aes_x86_64_asm.asm @@ -470,23 +470,25 @@ AES_256_Key_Expansion_AESNI ENDP _TEXT ENDS _TEXT SEGMENT READONLY PARA AES_ECB_encrypt_AESNI PROC - mov eax, DWORD PTR [rsp+40] + push r12 + push r13 + mov eax, DWORD PTR [rsp+56] sub rsp, 16 movdqu OWORD PTR [rsp], xmm6 - xor eax, eax + xor r10d, r10d cmp r8d, 64 - mov r9d, r8d + mov r11d, r8d jl L_AES_ECB_encrypt_AESNI_done_64 - and r9d, 4294967232 + and r11d, 4294967232 L_AES_ECB_encrypt_AESNI_enc_64: ; 64 bytes of input ; aes_ecb_enc_64 - lea r10, QWORD PTR [rcx+rax] - lea r11, QWORD PTR [rdx+rax] - movdqu xmm0, OWORD PTR [r10] - movdqu xmm1, OWORD PTR [r10+16] - movdqu xmm2, OWORD PTR [r10+32] - movdqu xmm3, OWORD PTR [r10+48] + lea r12, QWORD PTR [rcx+r10] + lea r13, QWORD PTR [rdx+r10] + movdqu xmm0, OWORD PTR [r12] + movdqu xmm1, OWORD PTR [r12+16] + movdqu xmm2, OWORD PTR [r12+32] + movdqu xmm3, OWORD PTR [r12+48] ; aes_enc_block movdqu xmm4, OWORD PTR [r9] pxor xmm0, xmm4 @@ -568,22 +570,22 @@ 
L_AES_ECB_encrypt_AESNI_64_aes_enc_block_last: aesenclast xmm1, xmm4 aesenclast xmm2, xmm4 aesenclast xmm3, xmm4 - movdqu OWORD PTR [r11], xmm0 - movdqu OWORD PTR [r11+16], xmm1 - movdqu OWORD PTR [r11+32], xmm2 - movdqu OWORD PTR [r11+48], xmm3 - add eax, 64 - cmp eax, r9d + movdqu OWORD PTR [r13], xmm0 + movdqu OWORD PTR [r13+16], xmm1 + movdqu OWORD PTR [r13+32], xmm2 + movdqu OWORD PTR [r13+48], xmm3 + add r10d, 64 + cmp r10d, r11d jl L_AES_ECB_encrypt_AESNI_enc_64 L_AES_ECB_encrypt_AESNI_done_64: - cmp eax, r8d - mov r9d, r8d + cmp r10d, r8d + mov r11d, r8d je L_AES_ECB_encrypt_AESNI_done_enc - and r9d, 4294967280 + and r11d, 4294967280 L_AES_ECB_encrypt_AESNI_enc_16: ; 16 bytes of input - lea r10, QWORD PTR [rcx+rax] - movdqu xmm0, OWORD PTR [r10] + lea r12, QWORD PTR [rcx+r10] + movdqu xmm0, OWORD PTR [r12] ; aes_enc_block pxor xmm0, [r9] movdqu xmm5, OWORD PTR [r9+16] @@ -619,36 +621,40 @@ L_AES_ECB_encrypt_AESNI_enc_16: movdqu xmm5, OWORD PTR [r9+224] L_AES_ECB_encrypt_AESNI_16_aes_enc_block_last: aesenclast xmm0, xmm5 - lea r10, QWORD PTR [rdx+rax] - movdqu OWORD PTR [r10], xmm0 - add eax, 16 - cmp eax, r9d + lea r12, QWORD PTR [rdx+r10] + movdqu OWORD PTR [r12], xmm0 + add r10d, 16 + cmp r10d, r11d jl L_AES_ECB_encrypt_AESNI_enc_16 L_AES_ECB_encrypt_AESNI_done_enc: movdqu xmm6, OWORD PTR [rsp] add rsp, 16 + pop r13 + pop r12 ret AES_ECB_encrypt_AESNI ENDP _TEXT ENDS _TEXT SEGMENT READONLY PARA AES_ECB_decrypt_AESNI PROC - mov eax, DWORD PTR [rsp+40] + push r12 + push r13 + mov eax, DWORD PTR [rsp+56] sub rsp, 16 movdqu OWORD PTR [rsp], xmm6 - xor eax, eax + xor r10d, r10d cmp r8d, 64 - mov r9d, r8d + mov r11d, r8d jl L_AES_ECB_decrypt_AESNI_done_64 - and r9d, 4294967232 + and r11d, 4294967232 L_AES_ECB_decrypt_AESNI_dec_64: ; 64 bytes of input ; aes_ecb_dec_64 - lea r10, QWORD PTR [rcx+rax] - lea r11, QWORD PTR [rdx+rax] - movdqu xmm0, OWORD PTR [r10] - movdqu xmm1, OWORD PTR [r10+16] - movdqu xmm2, OWORD PTR [r10+32] - movdqu xmm3, OWORD PTR [r10+48] + 
lea r12, QWORD PTR [rcx+r10] + lea r13, QWORD PTR [rdx+r10] + movdqu xmm0, OWORD PTR [r12] + movdqu xmm1, OWORD PTR [r12+16] + movdqu xmm2, OWORD PTR [r12+32] + movdqu xmm3, OWORD PTR [r12+48] ; aes_dec_block movdqu xmm4, OWORD PTR [r9] pxor xmm0, xmm4 @@ -730,22 +736,22 @@ L_AES_ECB_decrypt_AESNI_64_aes_dec_block_last: aesdeclast xmm1, xmm4 aesdeclast xmm2, xmm4 aesdeclast xmm3, xmm4 - movdqu OWORD PTR [r11], xmm0 - movdqu OWORD PTR [r11+16], xmm1 - movdqu OWORD PTR [r11+32], xmm2 - movdqu OWORD PTR [r11+48], xmm3 - add eax, 64 - cmp eax, r9d + movdqu OWORD PTR [r13], xmm0 + movdqu OWORD PTR [r13+16], xmm1 + movdqu OWORD PTR [r13+32], xmm2 + movdqu OWORD PTR [r13+48], xmm3 + add r10d, 64 + cmp r10d, r11d jl L_AES_ECB_decrypt_AESNI_dec_64 L_AES_ECB_decrypt_AESNI_done_64: - cmp eax, r8d - mov r9d, r8d + cmp r10d, r8d + mov r11d, r8d je L_AES_ECB_decrypt_AESNI_done_dec - and r9d, 4294967280 + and r11d, 4294967280 L_AES_ECB_decrypt_AESNI_dec_16: ; 16 bytes of input - lea r10, QWORD PTR [rcx+rax] - movdqu xmm0, OWORD PTR [r10] + lea r12, QWORD PTR [rcx+r10] + movdqu xmm0, OWORD PTR [r12] ; aes_dec_block pxor xmm0, [r9] movdqu xmm5, OWORD PTR [r9+16] @@ -781,29 +787,33 @@ L_AES_ECB_decrypt_AESNI_dec_16: movdqu xmm5, OWORD PTR [r9+224] L_AES_ECB_decrypt_AESNI_16_aes_dec_block_last: aesdeclast xmm0, xmm5 - lea r10, QWORD PTR [rdx+rax] - movdqu OWORD PTR [r10], xmm0 - add eax, 16 - cmp eax, r9d + lea r12, QWORD PTR [rdx+r10] + movdqu OWORD PTR [r12], xmm0 + add r10d, 16 + cmp r10d, r11d jl L_AES_ECB_decrypt_AESNI_dec_16 L_AES_ECB_decrypt_AESNI_done_dec: movdqu xmm6, OWORD PTR [rsp] add rsp, 16 + pop r13 + pop r12 ret AES_ECB_decrypt_AESNI ENDP _TEXT ENDS _TEXT SEGMENT READONLY PARA AES_CBC_encrypt_AESNI PROC - mov rax, QWORD PTR [rsp+40] - mov r10d, DWORD PTR [rsp+48] + push r12 + push r13 + mov rax, QWORD PTR [rsp+56] + mov r10d, DWORD PTR [rsp+64] movdqu xmm0, OWORD PTR [r8] - xor eax, eax - cmp eax, r9d + xor r11d, r11d + cmp r11d, r9d je L_AES_CBC_encrypt_AESNI_done 
L_AES_CBC_encrypt_AESNI_loop: ; 16 bytes of input - lea r10, QWORD PTR [rcx+rax] - movdqu xmm1, OWORD PTR [r10] + lea r12, QWORD PTR [rcx+r11] + movdqu xmm1, OWORD PTR [r12] pxor xmm1, xmm0 ; aes_enc_block pxor xmm1, [rax] @@ -840,41 +850,45 @@ L_AES_CBC_encrypt_AESNI_loop: movdqu xmm3, OWORD PTR [rax+224] L_AES_CBC_encrypt_AESNI_aes_enc_block_last: aesenclast xmm1, xmm3 - lea r11, QWORD PTR [rdx+rax] - movdqu OWORD PTR [r11], xmm1 + lea r13, QWORD PTR [rdx+r11] + movdqu OWORD PTR [r13], xmm1 movdqa xmm0, xmm1 - add eax, 16 - cmp eax, r9d + add r11d, 16 + cmp r11d, r9d jl L_AES_CBC_encrypt_AESNI_loop L_AES_CBC_encrypt_AESNI_done: movdqu OWORD PTR [r8], xmm0 + pop r13 + pop r12 ret AES_CBC_encrypt_AESNI ENDP _TEXT ENDS _TEXT SEGMENT READONLY PARA AES_CBC_decrypt_AESNI PROC push r12 - mov rax, QWORD PTR [rsp+48] - mov r10d, DWORD PTR [rsp+56] + push r13 + push r14 + mov rax, QWORD PTR [rsp+64] + mov r10d, DWORD PTR [rsp+72] sub rsp, 48 movdqu OWORD PTR [rsp], xmm6 movdqu OWORD PTR [rsp+16], xmm7 movdqu OWORD PTR [rsp+32], xmm8 movdqu xmm4, OWORD PTR [r8] - xor eax, eax + xor r11d, r11d cmp r9d, 64 - mov r10d, r9d + mov r12d, r9d jl L_AES_CBC_decrypt_AESNI_done_64 - and r10d, 4294967232 + and r12d, 4294967232 L_AES_CBC_decrypt_AESNI_dec_64: ; 64 bytes of input ; aes_cbc_dec_64 - lea r11, QWORD PTR [rcx+rax] - lea r12, QWORD PTR [rdx+rax] - movdqu xmm0, OWORD PTR [r11] - movdqu xmm1, OWORD PTR [r11+16] - movdqu xmm2, OWORD PTR [r11+32] - movdqu xmm3, OWORD PTR [r11+48] + lea r13, QWORD PTR [rcx+r11] + lea r14, QWORD PTR [rdx+r11] + movdqu xmm0, OWORD PTR [r13] + movdqu xmm1, OWORD PTR [r13+16] + movdqu xmm2, OWORD PTR [r13+32] + movdqu xmm3, OWORD PTR [r13+48] ; aes_dec_block movdqu xmm5, OWORD PTR [rax] pxor xmm0, xmm5 @@ -957,29 +971,29 @@ L_AES_CBC_decrypt_AESNI_64_aes_dec_block_last: aesdeclast xmm2, xmm5 aesdeclast xmm3, xmm5 pxor xmm0, xmm4 - movdqu xmm5, OWORD PTR [r11] + movdqu xmm5, OWORD PTR [r13] pxor xmm1, xmm5 - movdqu xmm5, OWORD PTR [r11+16] + movdqu 
xmm5, OWORD PTR [r13+16] pxor xmm2, xmm5 - movdqu xmm5, OWORD PTR [r11+32] + movdqu xmm5, OWORD PTR [r13+32] pxor xmm3, xmm5 - movdqu xmm4, OWORD PTR [r11+48] - movdqu OWORD PTR [r12], xmm0 - movdqu OWORD PTR [r12+16], xmm1 - movdqu OWORD PTR [r12+32], xmm2 - movdqu OWORD PTR [r12+48], xmm3 - add eax, 64 - cmp eax, r10d + movdqu xmm4, OWORD PTR [r13+48] + movdqu OWORD PTR [r14], xmm0 + movdqu OWORD PTR [r14+16], xmm1 + movdqu OWORD PTR [r14+32], xmm2 + movdqu OWORD PTR [r14+48], xmm3 + add r11d, 64 + cmp r11d, r12d jl L_AES_CBC_decrypt_AESNI_dec_64 L_AES_CBC_decrypt_AESNI_done_64: - cmp eax, r9d - mov r10d, r9d + cmp r11d, r9d + mov r12d, r9d je L_AES_CBC_decrypt_AESNI_done_dec - and r10d, 4294967280 + and r12d, 4294967280 L_AES_CBC_decrypt_AESNI_dec_16: ; 16 bytes of input - lea r11, QWORD PTR [rcx+rax] - movdqu xmm0, OWORD PTR [r11] + lea r13, QWORD PTR [rcx+r11] + movdqu xmm0, OWORD PTR [r13] movdqa xmm8, xmm0 ; aes_dec_block pxor xmm0, [rax] @@ -1018,10 +1032,10 @@ L_AES_CBC_decrypt_AESNI_16_aes_dec_block_last: aesdeclast xmm0, xmm6 pxor xmm0, xmm4 movdqa xmm4, xmm8 - lea r11, QWORD PTR [rdx+rax] - movdqu OWORD PTR [r11], xmm0 - add eax, 16 - cmp eax, r10d + lea r13, QWORD PTR [rdx+r11] + movdqu OWORD PTR [r13], xmm0 + add r11d, 16 + cmp r11d, r12d jl L_AES_CBC_decrypt_AESNI_dec_16 L_AES_CBC_decrypt_AESNI_done_dec: movdqu OWORD PTR [r8], xmm4 @@ -1029,27 +1043,29 @@ L_AES_CBC_decrypt_AESNI_done_dec: movdqu xmm7, OWORD PTR [rsp+16] movdqu xmm8, OWORD PTR [rsp+32] add rsp, 48 + pop r14 + pop r13 pop r12 ret AES_CBC_decrypt_AESNI ENDP _TEXT ENDS _DATA SEGMENT ALIGN 16 -L_aes_ctr_aesni_bswap QWORD \ - 08090a0b0c0d0e0fh, 0001020304050607h +L_aes_ctr_aesni_bswap QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_aes_ctr_aesni_bswap QWORD L_aes_ctr_aesni_bswap _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_ctr_aesni_one QWORD \ - 0000000000000001h, 0000000000000000h +L_aes_ctr_aesni_one QWORD 0000000000000001h, 0000000000000000h ptr_L_aes_ctr_aesni_one QWORD 
L_aes_ctr_aesni_one _DATA ENDS _TEXT SEGMENT READONLY PARA AES_CTR_encrypt_AESNI PROC - push rbx - mov eax, DWORD PTR [rsp+48] - mov r10, QWORD PTR [rsp+56] + push r12 + push r13 + push r14 + mov eax, DWORD PTR [rsp+64] + mov r10, QWORD PTR [rsp+72] sub rsp, 96 movdqu OWORD PTR [rsp], xmm6 movdqu OWORD PTR [rsp+16], xmm7 @@ -1062,16 +1078,16 @@ AES_CTR_encrypt_AESNI PROC pxor xmm10, xmm10 movdqu xmm7, OWORD PTR [r10] pshufb xmm7, xmm8 - xor eax, eax + xor r11d, r11d cmp r8d, 64 - mov r10d, r8d + mov r12d, r8d jl L_AES_CTR_encrypt_AESNI_done_64 - and r10d, 4294967232 + and r12d, 4294967232 L_AES_CTR_encrypt_AESNI_enc_64: ; 64 bytes of input ; aes_ctr_enc_64 - lea r11, QWORD PTR [rcx+rax] - lea rbx, QWORD PTR [rdx+rax] + lea r13, QWORD PTR [rcx+r11] + lea r14, QWORD PTR [rdx+r11] movdqa xmm0, xmm7 pshufb xmm0, xmm8 paddq xmm7, xmm9 @@ -1185,26 +1201,26 @@ L_AES_CTR_encrypt_AESNI_64_aes_enc_block_last: aesenclast xmm1, xmm4 aesenclast xmm2, xmm4 aesenclast xmm3, xmm4 - movdqu xmm4, OWORD PTR [r11] + movdqu xmm4, OWORD PTR [r13] pxor xmm0, xmm4 - movdqu xmm4, OWORD PTR [r11+16] + movdqu xmm4, OWORD PTR [r13+16] pxor xmm1, xmm4 - movdqu xmm4, OWORD PTR [r11+32] + movdqu xmm4, OWORD PTR [r13+32] pxor xmm2, xmm4 - movdqu xmm4, OWORD PTR [r11+48] + movdqu xmm4, OWORD PTR [r13+48] pxor xmm3, xmm4 - movdqu OWORD PTR [rbx], xmm0 - movdqu OWORD PTR [rbx+16], xmm1 - movdqu OWORD PTR [rbx+32], xmm2 - movdqu OWORD PTR [rbx+48], xmm3 - add eax, 64 - cmp eax, r10d + movdqu OWORD PTR [r14], xmm0 + movdqu OWORD PTR [r14+16], xmm1 + movdqu OWORD PTR [r14+32], xmm2 + movdqu OWORD PTR [r14+48], xmm3 + add r11d, 64 + cmp r11d, r12d jl L_AES_CTR_encrypt_AESNI_enc_64 L_AES_CTR_encrypt_AESNI_done_64: - cmp eax, r8d - mov r10d, r8d + cmp r11d, r8d + mov r12d, r8d je L_AES_CTR_encrypt_AESNI_done_enc - and r10d, 4294967280 + and r12d, 4294967280 L_AES_CTR_encrypt_AESNI_enc_16: ; 16 bytes of input movdqa xmm0, xmm7 @@ -1250,13 +1266,13 @@ L_AES_CTR_encrypt_AESNI_enc_16: movdqu xmm5, OWORD PTR 
[r9+224] L_AES_CTR_encrypt_AESNI_16_aes_enc_block_last: aesenclast xmm0, xmm5 - lea r11, QWORD PTR [rcx+rax] - movdqu xmm4, OWORD PTR [r11] + lea r13, QWORD PTR [rcx+r11] + movdqu xmm4, OWORD PTR [r13] pxor xmm0, xmm4 - lea r11, QWORD PTR [rdx+rax] - movdqu OWORD PTR [r11], xmm0 - add eax, 16 - cmp eax, r10d + lea r13, QWORD PTR [rdx+r11] + movdqu OWORD PTR [r13], xmm0 + add r11d, 16 + cmp r11d, r12d jl L_AES_CTR_encrypt_AESNI_enc_16 L_AES_CTR_encrypt_AESNI_done_enc: pshufb xmm7, xmm8 @@ -1268,30 +1284,34 @@ L_AES_CTR_encrypt_AESNI_done_enc: movdqu xmm10, OWORD PTR [rsp+64] movdqu xmm11, OWORD PTR [rsp+80] add rsp, 96 - pop rbx + pop r14 + pop r13 + pop r12 ret AES_CTR_encrypt_AESNI ENDP _TEXT ENDS IFDEF HAVE_INTEL_AVX1 _TEXT SEGMENT READONLY PARA AES_ECB_encrypt_avx1 PROC - mov eax, DWORD PTR [rsp+40] + push r12 + push r13 + mov eax, DWORD PTR [rsp+56] sub rsp, 16 vmovdqu OWORD PTR [rsp], xmm6 - xor eax, eax + xor r10d, r10d cmp r8d, 64 - mov r9d, r8d + mov r11d, r8d jl L_AES_ECB_encrypt_avx1_done_64 - and r9d, 4294967232 + and r11d, 4294967232 L_AES_ECB_encrypt_avx1_enc_64: ; 64 bytes of input ; aes_ecb_enc_64 - lea r10, QWORD PTR [rcx+rax] - lea r11, QWORD PTR [rdx+rax] - vmovdqu xmm0, OWORD PTR [r10] - vmovdqu xmm1, OWORD PTR [r10+16] - vmovdqu xmm2, OWORD PTR [r10+32] - vmovdqu xmm3, OWORD PTR [r10+48] + lea r12, QWORD PTR [rcx+r10] + lea r13, QWORD PTR [rdx+r10] + vmovdqu xmm0, OWORD PTR [r12] + vmovdqu xmm1, OWORD PTR [r12+16] + vmovdqu xmm2, OWORD PTR [r12+32] + vmovdqu xmm3, OWORD PTR [r12+48] ; aes_enc_block vmovdqu xmm4, OWORD PTR [r9] vpxor xmm0, xmm0, xmm4 @@ -1373,22 +1393,22 @@ L_AES_ECB_encrypt_avx1_64_aes_enc_block_last: vaesenclast xmm1, xmm1, xmm4 vaesenclast xmm2, xmm2, xmm4 vaesenclast xmm3, xmm3, xmm4 - vmovdqu OWORD PTR [r11], xmm0 - vmovdqu OWORD PTR [r11+16], xmm1 - vmovdqu OWORD PTR [r11+32], xmm2 - vmovdqu OWORD PTR [r11+48], xmm3 - add eax, 64 - cmp eax, r9d + vmovdqu OWORD PTR [r13], xmm0 + vmovdqu OWORD PTR [r13+16], xmm1 + vmovdqu 
OWORD PTR [r13+32], xmm2 + vmovdqu OWORD PTR [r13+48], xmm3 + add r10d, 64 + cmp r10d, r11d jl L_AES_ECB_encrypt_avx1_enc_64 L_AES_ECB_encrypt_avx1_done_64: - cmp eax, r8d - mov r9d, r8d + cmp r10d, r8d + mov r11d, r8d je L_AES_ECB_encrypt_avx1_done_enc - and r9d, 4294967280 + and r11d, 4294967280 L_AES_ECB_encrypt_avx1_enc_16: ; 16 bytes of input - lea r10, QWORD PTR [rcx+rax] - vmovdqu xmm0, OWORD PTR [r10] + lea r12, QWORD PTR [rcx+r10] + vmovdqu xmm0, OWORD PTR [r12] ; aes_enc_block vpxor xmm0, xmm0, [r9] vmovdqu xmm5, OWORD PTR [r9+16] @@ -1424,36 +1444,40 @@ L_AES_ECB_encrypt_avx1_enc_16: vmovdqu xmm5, OWORD PTR [r9+224] L_AES_ECB_encrypt_avx1_16_aes_enc_block_last: vaesenclast xmm0, xmm0, xmm5 - lea r10, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r10], xmm0 - add eax, 16 - cmp eax, r9d + lea r12, QWORD PTR [rdx+r10] + vmovdqu OWORD PTR [r12], xmm0 + add r10d, 16 + cmp r10d, r11d jl L_AES_ECB_encrypt_avx1_enc_16 L_AES_ECB_encrypt_avx1_done_enc: vmovdqu xmm6, OWORD PTR [rsp] add rsp, 16 + pop r13 + pop r12 ret AES_ECB_encrypt_avx1 ENDP _TEXT ENDS _TEXT SEGMENT READONLY PARA AES_ECB_decrypt_avx1 PROC - mov eax, DWORD PTR [rsp+40] + push r12 + push r13 + mov eax, DWORD PTR [rsp+56] sub rsp, 16 vmovdqu OWORD PTR [rsp], xmm6 - xor eax, eax + xor r10d, r10d cmp r8d, 64 - mov r9d, r8d + mov r11d, r8d jl L_AES_ECB_decrypt_avx1_done_64 - and r9d, 4294967232 + and r11d, 4294967232 L_AES_ECB_decrypt_avx1_dec_64: ; 64 bytes of input ; aes_ecb_dec_64 - lea r10, QWORD PTR [rcx+rax] - lea r11, QWORD PTR [rdx+rax] - vmovdqu xmm0, OWORD PTR [r10] - vmovdqu xmm1, OWORD PTR [r10+16] - vmovdqu xmm2, OWORD PTR [r10+32] - vmovdqu xmm3, OWORD PTR [r10+48] + lea r12, QWORD PTR [rcx+r10] + lea r13, QWORD PTR [rdx+r10] + vmovdqu xmm0, OWORD PTR [r12] + vmovdqu xmm1, OWORD PTR [r12+16] + vmovdqu xmm2, OWORD PTR [r12+32] + vmovdqu xmm3, OWORD PTR [r12+48] ; aes_dec_block vmovdqu xmm4, OWORD PTR [r9] vpxor xmm0, xmm0, xmm4 @@ -1535,22 +1559,22 @@ 
L_AES_ECB_decrypt_avx1_64_aes_dec_block_last: vaesdeclast xmm1, xmm1, xmm4 vaesdeclast xmm2, xmm2, xmm4 vaesdeclast xmm3, xmm3, xmm4 - vmovdqu OWORD PTR [r11], xmm0 - vmovdqu OWORD PTR [r11+16], xmm1 - vmovdqu OWORD PTR [r11+32], xmm2 - vmovdqu OWORD PTR [r11+48], xmm3 - add eax, 64 - cmp eax, r9d + vmovdqu OWORD PTR [r13], xmm0 + vmovdqu OWORD PTR [r13+16], xmm1 + vmovdqu OWORD PTR [r13+32], xmm2 + vmovdqu OWORD PTR [r13+48], xmm3 + add r10d, 64 + cmp r10d, r11d jl L_AES_ECB_decrypt_avx1_dec_64 L_AES_ECB_decrypt_avx1_done_64: - cmp eax, r8d - mov r9d, r8d + cmp r10d, r8d + mov r11d, r8d je L_AES_ECB_decrypt_avx1_done_dec - and r9d, 4294967280 + and r11d, 4294967280 L_AES_ECB_decrypt_avx1_dec_16: ; 16 bytes of input - lea r10, QWORD PTR [rcx+rax] - vmovdqu xmm0, OWORD PTR [r10] + lea r12, QWORD PTR [rcx+r10] + vmovdqu xmm0, OWORD PTR [r12] ; aes_dec_block vpxor xmm0, xmm0, [r9] vmovdqu xmm5, OWORD PTR [r9+16] @@ -1586,29 +1610,33 @@ L_AES_ECB_decrypt_avx1_dec_16: vmovdqu xmm5, OWORD PTR [r9+224] L_AES_ECB_decrypt_avx1_16_aes_dec_block_last: vaesdeclast xmm0, xmm0, xmm5 - lea r10, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r10], xmm0 - add eax, 16 - cmp eax, r9d + lea r12, QWORD PTR [rdx+r10] + vmovdqu OWORD PTR [r12], xmm0 + add r10d, 16 + cmp r10d, r11d jl L_AES_ECB_decrypt_avx1_dec_16 L_AES_ECB_decrypt_avx1_done_dec: vmovdqu xmm6, OWORD PTR [rsp] add rsp, 16 + pop r13 + pop r12 ret AES_ECB_decrypt_avx1 ENDP _TEXT ENDS _TEXT SEGMENT READONLY PARA AES_CBC_encrypt_avx1 PROC - mov rax, QWORD PTR [rsp+40] - mov r10d, DWORD PTR [rsp+48] + push r12 + push r13 + mov rax, QWORD PTR [rsp+56] + mov r10d, DWORD PTR [rsp+64] vmovdqu xmm0, OWORD PTR [r8] - xor eax, eax - cmp eax, r9d + xor r11d, r11d + cmp r11d, r9d je L_AES_CBC_encrypt_avx1_done L_AES_CBC_encrypt_avx1_loop: ; 16 bytes of input - lea r10, QWORD PTR [rcx+rax] - vmovdqu xmm1, OWORD PTR [r10] + lea r12, QWORD PTR [rcx+r11] + vmovdqu xmm1, OWORD PTR [r12] vpxor xmm1, xmm1, xmm0 ; aes_enc_block vpxor xmm1, xmm1, 
[rax] @@ -1645,41 +1673,45 @@ L_AES_CBC_encrypt_avx1_loop: vmovdqu xmm3, OWORD PTR [rax+224] L_AES_CBC_encrypt_avx1_aes_enc_block_last: vaesenclast xmm1, xmm1, xmm3 - lea r11, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r11], xmm1 + lea r13, QWORD PTR [rdx+r11] + vmovdqu OWORD PTR [r13], xmm1 vmovdqa xmm0, xmm1 - add eax, 16 - cmp eax, r9d + add r11d, 16 + cmp r11d, r9d jl L_AES_CBC_encrypt_avx1_loop L_AES_CBC_encrypt_avx1_done: vmovdqu OWORD PTR [r8], xmm0 + pop r13 + pop r12 ret AES_CBC_encrypt_avx1 ENDP _TEXT ENDS _TEXT SEGMENT READONLY PARA AES_CBC_decrypt_avx1 PROC push r12 - mov rax, QWORD PTR [rsp+48] - mov r10d, DWORD PTR [rsp+56] + push r13 + push r14 + mov rax, QWORD PTR [rsp+64] + mov r10d, DWORD PTR [rsp+72] sub rsp, 48 vmovdqu OWORD PTR [rsp], xmm6 vmovdqu OWORD PTR [rsp+16], xmm7 vmovdqu OWORD PTR [rsp+32], xmm8 vmovdqu xmm4, OWORD PTR [r8] - xor eax, eax + xor r11d, r11d cmp r9d, 64 - mov r10d, r9d + mov r12d, r9d jl L_AES_CBC_decrypt_avx1_done_64 - and r10d, 4294967232 + and r12d, 4294967232 L_AES_CBC_decrypt_avx1_dec_64: ; 64 bytes of input ; aes_cbc_dec_64 - lea r11, QWORD PTR [rcx+rax] - lea r12, QWORD PTR [rdx+rax] - vmovdqu xmm0, OWORD PTR [r11] - vmovdqu xmm1, OWORD PTR [r11+16] - vmovdqu xmm2, OWORD PTR [r11+32] - vmovdqu xmm3, OWORD PTR [r11+48] + lea r13, QWORD PTR [rcx+r11] + lea r14, QWORD PTR [rdx+r11] + vmovdqu xmm0, OWORD PTR [r13] + vmovdqu xmm1, OWORD PTR [r13+16] + vmovdqu xmm2, OWORD PTR [r13+32] + vmovdqu xmm3, OWORD PTR [r13+48] ; aes_dec_block vmovdqu xmm5, OWORD PTR [rax] vpxor xmm0, xmm0, xmm5 @@ -1762,26 +1794,26 @@ L_AES_CBC_decrypt_avx1_64_aes_dec_block_last: vaesdeclast xmm2, xmm2, xmm5 vaesdeclast xmm3, xmm3, xmm5 vpxor xmm0, xmm0, xmm4 - vpxor xmm1, xmm1, [r11] - vpxor xmm2, xmm2, [r11+16] - vpxor xmm3, xmm3, [r11+32] - vmovdqu xmm4, OWORD PTR [r11+48] - vmovdqu OWORD PTR [r12], xmm0 - vmovdqu OWORD PTR [r12+16], xmm1 - vmovdqu OWORD PTR [r12+32], xmm2 - vmovdqu OWORD PTR [r12+48], xmm3 - add eax, 64 - cmp eax, r10d + vpxor 
xmm1, xmm1, [r13] + vpxor xmm2, xmm2, [r13+16] + vpxor xmm3, xmm3, [r13+32] + vmovdqu xmm4, OWORD PTR [r13+48] + vmovdqu OWORD PTR [r14], xmm0 + vmovdqu OWORD PTR [r14+16], xmm1 + vmovdqu OWORD PTR [r14+32], xmm2 + vmovdqu OWORD PTR [r14+48], xmm3 + add r11d, 64 + cmp r11d, r12d jl L_AES_CBC_decrypt_avx1_dec_64 L_AES_CBC_decrypt_avx1_done_64: - cmp eax, r9d - mov r10d, r9d + cmp r11d, r9d + mov r12d, r9d je L_AES_CBC_decrypt_avx1_done_dec - and r10d, 4294967280 + and r12d, 4294967280 L_AES_CBC_decrypt_avx1_dec_16: ; 16 bytes of input - lea r11, QWORD PTR [rcx+rax] - vmovdqu xmm0, OWORD PTR [r11] + lea r13, QWORD PTR [rcx+r11] + vmovdqu xmm0, OWORD PTR [r13] vmovdqa xmm8, xmm0 ; aes_dec_block vpxor xmm0, xmm0, [rax] @@ -1820,10 +1852,10 @@ L_AES_CBC_decrypt_avx1_16_aes_dec_block_last: vaesdeclast xmm0, xmm0, xmm6 vpxor xmm0, xmm0, xmm4 vmovdqa xmm4, xmm8 - lea r11, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r11], xmm0 - add eax, 16 - cmp eax, r10d + lea r13, QWORD PTR [rdx+r11] + vmovdqu OWORD PTR [r13], xmm0 + add r11d, 16 + cmp r11d, r12d jl L_AES_CBC_decrypt_avx1_dec_16 L_AES_CBC_decrypt_avx1_done_dec: vmovdqu OWORD PTR [r8], xmm4 @@ -1831,27 +1863,29 @@ L_AES_CBC_decrypt_avx1_done_dec: vmovdqu xmm7, OWORD PTR [rsp+16] vmovdqu xmm8, OWORD PTR [rsp+32] add rsp, 48 + pop r14 + pop r13 pop r12 ret AES_CBC_decrypt_avx1 ENDP _TEXT ENDS _DATA SEGMENT ALIGN 16 -L_aes_ctr_avx1_bswap QWORD \ - 08090a0b0c0d0e0fh, 0001020304050607h +L_aes_ctr_avx1_bswap QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_aes_ctr_avx1_bswap QWORD L_aes_ctr_avx1_bswap _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_ctr_avx1_one QWORD \ - 0000000000000001h, 0000000000000000h +L_aes_ctr_avx1_one QWORD 0000000000000001h, 0000000000000000h ptr_L_aes_ctr_avx1_one QWORD L_aes_ctr_avx1_one _DATA ENDS _TEXT SEGMENT READONLY PARA AES_CTR_encrypt_avx1 PROC - push rbx - mov eax, DWORD PTR [rsp+48] - mov r10, QWORD PTR [rsp+56] + push r12 + push r13 + push r14 + mov eax, DWORD PTR [rsp+64] + mov r10, QWORD PTR 
[rsp+72] sub rsp, 96 vmovdqu OWORD PTR [rsp], xmm6 vmovdqu OWORD PTR [rsp+16], xmm7 @@ -1864,16 +1898,16 @@ AES_CTR_encrypt_avx1 PROC vpxor xmm10, xmm10, xmm10 vmovdqu xmm7, OWORD PTR [r10] vpshufb xmm7, xmm7, xmm8 - xor eax, eax + xor r11d, r11d cmp r8d, 64 - mov r10d, r8d + mov r12d, r8d jl L_AES_CTR_encrypt_avx1_done_64 - and r10d, 4294967232 + and r12d, 4294967232 L_AES_CTR_encrypt_avx1_enc_64: ; 64 bytes of input ; aes_ctr_enc_64 - lea r11, QWORD PTR [rcx+rax] - lea rbx, QWORD PTR [rdx+rax] + lea r13, QWORD PTR [rcx+r11] + lea r14, QWORD PTR [rdx+r11] vpshufb xmm0, xmm7, xmm8 vpaddq xmm7, xmm7, xmm9 vpcmpeqq xmm11, xmm7, xmm10 @@ -1979,22 +2013,22 @@ L_AES_CTR_encrypt_avx1_64_aes_enc_block_last: vaesenclast xmm1, xmm1, xmm4 vaesenclast xmm2, xmm2, xmm4 vaesenclast xmm3, xmm3, xmm4 - vpxor xmm0, xmm0, [r11] - vpxor xmm1, xmm1, [r11+16] - vpxor xmm2, xmm2, [r11+32] - vpxor xmm3, xmm3, [r11+48] - vmovdqu OWORD PTR [rbx], xmm0 - vmovdqu OWORD PTR [rbx+16], xmm1 - vmovdqu OWORD PTR [rbx+32], xmm2 - vmovdqu OWORD PTR [rbx+48], xmm3 - add eax, 64 - cmp eax, r10d + vpxor xmm0, xmm0, [r13] + vpxor xmm1, xmm1, [r13+16] + vpxor xmm2, xmm2, [r13+32] + vpxor xmm3, xmm3, [r13+48] + vmovdqu OWORD PTR [r14], xmm0 + vmovdqu OWORD PTR [r14+16], xmm1 + vmovdqu OWORD PTR [r14+32], xmm2 + vmovdqu OWORD PTR [r14+48], xmm3 + add r11d, 64 + cmp r11d, r12d jl L_AES_CTR_encrypt_avx1_enc_64 L_AES_CTR_encrypt_avx1_done_64: - cmp eax, r8d - mov r10d, r8d + cmp r11d, r8d + mov r12d, r8d je L_AES_CTR_encrypt_avx1_done_enc - and r10d, 4294967280 + and r12d, 4294967280 L_AES_CTR_encrypt_avx1_enc_16: ; 16 bytes of input vpshufb xmm0, xmm7, xmm8 @@ -2038,12 +2072,12 @@ L_AES_CTR_encrypt_avx1_enc_16: vmovdqu xmm5, OWORD PTR [r9+224] L_AES_CTR_encrypt_avx1_16_aes_enc_block_last: vaesenclast xmm0, xmm0, xmm5 - lea r11, QWORD PTR [rcx+rax] - vpxor xmm0, xmm0, [r11] - lea r11, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r11], xmm0 - add eax, 16 - cmp eax, r10d + lea r13, QWORD PTR [rcx+r11] + vpxor 
xmm0, xmm0, [r13] + lea r13, QWORD PTR [rdx+r11] + vmovdqu OWORD PTR [r13], xmm0 + add r11d, 16 + cmp r11d, r12d jl L_AES_CTR_encrypt_avx1_enc_16 L_AES_CTR_encrypt_avx1_done_enc: vpshufb xmm7, xmm7, xmm8 @@ -2055,7 +2089,9 @@ L_AES_CTR_encrypt_avx1_done_enc: vmovdqu xmm10, OWORD PTR [rsp+64] vmovdqu xmm11, OWORD PTR [rsp+80] add rsp, 96 - pop rbx + pop r14 + pop r13 + pop r12 ret AES_CTR_encrypt_avx1 ENDP _TEXT ENDS @@ -2063,172 +2099,174 @@ ENDIF IFDEF HAVE_INTEL_VAES _TEXT SEGMENT READONLY PARA AES_ECB_encrypt_vaes PROC - mov eax, DWORD PTR [rsp+40] + push r12 + push r13 + mov eax, DWORD PTR [rsp+56] sub rsp, 32 vmovdqu OWORD PTR [rsp], xmm6 vmovdqu OWORD PTR [rsp+16], xmm7 - xor eax, eax + xor r10d, r10d cmp r8d, 128 - mov r9d, r8d + mov r11d, r8d jl L_AES_ECB_encrypt_vaes_done_128 - and r9d, 4294967168 + and r11d, 4294967168 L_AES_ECB_encrypt_vaes_enc_128: ; 128 bytes of input ; aes_ecb_enc_128 - lea r10, QWORD PTR [rcx+rax] - lea r11, QWORD PTR [rdx+rax] - vmovdqu ymm0, YMMWORD PTR [r10] - vmovdqu ymm1, YMMWORD PTR [r10+32] - vmovdqu ymm2, YMMWORD PTR [r10+64] - vmovdqu ymm3, YMMWORD PTR [r10+96] + lea r12, QWORD PTR [rcx+r10] + lea r13, QWORD PTR [rdx+r10] + vmovdqu ymm0, YMMWORD PTR [r12] + vmovdqu ymm1, YMMWORD PTR [r12+32] + vmovdqu ymm2, YMMWORD PTR [r12+64] + vmovdqu ymm3, YMMWORD PTR [r12+96] ; aes_enc_block - vbroadcasti128 ymm7, [r9] + vbroadcasti128 ymm7, OWORD PTR [r9] vpxor ymm0, ymm0, ymm7 vpxor ymm1, ymm1, ymm7 vpxor ymm2, ymm2, ymm7 vpxor ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+16] + vbroadcasti128 ymm7, OWORD PTR [r9+16] vaesenc ymm0, ymm0, ymm7 vaesenc ymm1, ymm1, ymm7 vaesenc ymm2, ymm2, ymm7 vaesenc ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+32] + vbroadcasti128 ymm7, OWORD PTR [r9+32] vaesenc ymm0, ymm0, ymm7 vaesenc ymm1, ymm1, ymm7 vaesenc ymm2, ymm2, ymm7 vaesenc ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+48] + vbroadcasti128 ymm7, OWORD PTR [r9+48] vaesenc ymm0, ymm0, ymm7 vaesenc ymm1, ymm1, ymm7 vaesenc ymm2, ymm2, ymm7 vaesenc 
ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+64] + vbroadcasti128 ymm7, OWORD PTR [r9+64] vaesenc ymm0, ymm0, ymm7 vaesenc ymm1, ymm1, ymm7 vaesenc ymm2, ymm2, ymm7 vaesenc ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+80] + vbroadcasti128 ymm7, OWORD PTR [r9+80] vaesenc ymm0, ymm0, ymm7 vaesenc ymm1, ymm1, ymm7 vaesenc ymm2, ymm2, ymm7 vaesenc ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+96] + vbroadcasti128 ymm7, OWORD PTR [r9+96] vaesenc ymm0, ymm0, ymm7 vaesenc ymm1, ymm1, ymm7 vaesenc ymm2, ymm2, ymm7 vaesenc ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+112] + vbroadcasti128 ymm7, OWORD PTR [r9+112] vaesenc ymm0, ymm0, ymm7 vaesenc ymm1, ymm1, ymm7 vaesenc ymm2, ymm2, ymm7 vaesenc ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+128] + vbroadcasti128 ymm7, OWORD PTR [r9+128] vaesenc ymm0, ymm0, ymm7 vaesenc ymm1, ymm1, ymm7 vaesenc ymm2, ymm2, ymm7 vaesenc ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+144] + vbroadcasti128 ymm7, OWORD PTR [r9+144] vaesenc ymm0, ymm0, ymm7 vaesenc ymm1, ymm1, ymm7 vaesenc ymm2, ymm2, ymm7 vaesenc ymm3, ymm3, ymm7 cmp eax, 11 - vbroadcasti128 ymm7, [r9+160] + vbroadcasti128 ymm7, OWORD PTR [r9+160] jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last vaesenc ymm0, ymm0, ymm7 vaesenc ymm1, ymm1, ymm7 vaesenc ymm2, ymm2, ymm7 vaesenc ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+176] + vbroadcasti128 ymm7, OWORD PTR [r9+176] vaesenc ymm0, ymm0, ymm7 vaesenc ymm1, ymm1, ymm7 vaesenc ymm2, ymm2, ymm7 vaesenc ymm3, ymm3, ymm7 cmp eax, 13 - vbroadcasti128 ymm7, [r9+192] + vbroadcasti128 ymm7, OWORD PTR [r9+192] jl L_AES_ECB_encrypt_vaes_128_aes_enc_block_last vaesenc ymm0, ymm0, ymm7 vaesenc ymm1, ymm1, ymm7 vaesenc ymm2, ymm2, ymm7 vaesenc ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+208] + vbroadcasti128 ymm7, OWORD PTR [r9+208] vaesenc ymm0, ymm0, ymm7 vaesenc ymm1, ymm1, ymm7 vaesenc ymm2, ymm2, ymm7 vaesenc ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+224] + vbroadcasti128 ymm7, OWORD PTR [r9+224] L_AES_ECB_encrypt_vaes_128_aes_enc_block_last: 
vaesenclast ymm0, ymm0, ymm7 vaesenclast ymm1, ymm1, ymm7 vaesenclast ymm2, ymm2, ymm7 vaesenclast ymm3, ymm3, ymm7 - vmovdqu YMMWORD PTR [r11], ymm0 - vmovdqu YMMWORD PTR [r11+32], ymm1 - vmovdqu YMMWORD PTR [r11+64], ymm2 - vmovdqu YMMWORD PTR [r11+96], ymm3 - add eax, 128 - cmp eax, r9d + vmovdqu YMMWORD PTR [r13], ymm0 + vmovdqu YMMWORD PTR [r13+32], ymm1 + vmovdqu YMMWORD PTR [r13+64], ymm2 + vmovdqu YMMWORD PTR [r13+96], ymm3 + add r10d, 128 + cmp r10d, r11d jl L_AES_ECB_encrypt_vaes_enc_128 L_AES_ECB_encrypt_vaes_done_128: - mov r9d, r8d - and r9d, 4294967264 - cmp eax, r9d + mov r11d, r8d + and r11d, 4294967264 + cmp r10d, r11d je L_AES_ECB_encrypt_vaes_done_32 L_AES_ECB_encrypt_vaes_enc_32: ; 32 bytes of input ; aes_ecb_enc_32 - lea r10, QWORD PTR [rcx+rax] - lea r11, QWORD PTR [rdx+rax] - vmovdqu ymm0, YMMWORD PTR [r10] + lea r12, QWORD PTR [rcx+r10] + lea r13, QWORD PTR [rdx+r10] + vmovdqu ymm0, YMMWORD PTR [r12] ; aes_enc_block - vbroadcasti128 ymm7, [r9] + vbroadcasti128 ymm7, OWORD PTR [r9] vpxor ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+16] + vbroadcasti128 ymm7, OWORD PTR [r9+16] vaesenc ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+32] + vbroadcasti128 ymm7, OWORD PTR [r9+32] vaesenc ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+48] + vbroadcasti128 ymm7, OWORD PTR [r9+48] vaesenc ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+64] + vbroadcasti128 ymm7, OWORD PTR [r9+64] vaesenc ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+80] + vbroadcasti128 ymm7, OWORD PTR [r9+80] vaesenc ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+96] + vbroadcasti128 ymm7, OWORD PTR [r9+96] vaesenc ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+112] + vbroadcasti128 ymm7, OWORD PTR [r9+112] vaesenc ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+128] + vbroadcasti128 ymm7, OWORD PTR [r9+128] vaesenc ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+144] + vbroadcasti128 ymm7, OWORD PTR [r9+144] vaesenc ymm0, ymm0, ymm7 cmp eax, 11 - vbroadcasti128 ymm7, [r9+160] + vbroadcasti128 ymm7, OWORD PTR 
[r9+160] jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last vaesenc ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+176] + vbroadcasti128 ymm7, OWORD PTR [r9+176] vaesenc ymm0, ymm0, ymm7 cmp eax, 13 - vbroadcasti128 ymm7, [r9+192] + vbroadcasti128 ymm7, OWORD PTR [r9+192] jl L_AES_ECB_encrypt_vaes_32_aes_enc_block_last vaesenc ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+208] + vbroadcasti128 ymm7, OWORD PTR [r9+208] vaesenc ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+224] + vbroadcasti128 ymm7, OWORD PTR [r9+224] L_AES_ECB_encrypt_vaes_32_aes_enc_block_last: vaesenclast ymm0, ymm0, ymm7 - vmovdqu YMMWORD PTR [r11], ymm0 - add eax, 32 - cmp eax, r9d + vmovdqu YMMWORD PTR [r13], ymm0 + add r10d, 32 + cmp r10d, r11d jl L_AES_ECB_encrypt_vaes_enc_32 L_AES_ECB_encrypt_vaes_done_32: - cmp eax, r8d - mov r9d, r8d + cmp r10d, r8d + mov r11d, r8d je L_AES_ECB_encrypt_vaes_done_enc - and r9d, 4294967280 + and r11d, 4294967280 L_AES_ECB_encrypt_vaes_enc_16: ; 16 bytes of input - lea r10, QWORD PTR [rcx+rax] - vmovdqu xmm0, OWORD PTR [r10] + lea r12, QWORD PTR [rcx+r10] + vmovdqu xmm0, OWORD PTR [r12] ; aes_enc_block vpxor xmm0, xmm0, [r9] vmovdqu xmm5, OWORD PTR [r9+16] @@ -2264,186 +2302,190 @@ L_AES_ECB_encrypt_vaes_enc_16: vmovdqu xmm5, OWORD PTR [r9+224] L_AES_ECB_encrypt_vaes_16_aes_enc_block_last: vaesenclast xmm0, xmm0, xmm5 - lea r10, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r10], xmm0 - add eax, 16 - cmp eax, r9d + lea r12, QWORD PTR [rdx+r10] + vmovdqu OWORD PTR [r12], xmm0 + add r10d, 16 + cmp r10d, r11d jl L_AES_ECB_encrypt_vaes_enc_16 L_AES_ECB_encrypt_vaes_done_enc: vmovdqu xmm6, OWORD PTR [rsp] vmovdqu xmm7, OWORD PTR [rsp+16] add rsp, 32 + pop r13 + pop r12 ret AES_ECB_encrypt_vaes ENDP _TEXT ENDS _TEXT SEGMENT READONLY PARA AES_ECB_decrypt_vaes PROC - mov eax, DWORD PTR [rsp+40] + push r12 + push r13 + mov eax, DWORD PTR [rsp+56] sub rsp, 32 vmovdqu OWORD PTR [rsp], xmm6 vmovdqu OWORD PTR [rsp+16], xmm7 - xor eax, eax + xor r10d, r10d cmp r8d, 128 - mov r9d, r8d + 
mov r11d, r8d jl L_AES_ECB_decrypt_vaes_done_128 - and r9d, 4294967168 + and r11d, 4294967168 L_AES_ECB_decrypt_vaes_dec_128: ; 128 bytes of input ; aes_ecb_dec_128 - lea r10, QWORD PTR [rcx+rax] - lea r11, QWORD PTR [rdx+rax] - vmovdqu ymm0, YMMWORD PTR [r10] - vmovdqu ymm1, YMMWORD PTR [r10+32] - vmovdqu ymm2, YMMWORD PTR [r10+64] - vmovdqu ymm3, YMMWORD PTR [r10+96] + lea r12, QWORD PTR [rcx+r10] + lea r13, QWORD PTR [rdx+r10] + vmovdqu ymm0, YMMWORD PTR [r12] + vmovdqu ymm1, YMMWORD PTR [r12+32] + vmovdqu ymm2, YMMWORD PTR [r12+64] + vmovdqu ymm3, YMMWORD PTR [r12+96] ; aes_dec_block - vbroadcasti128 ymm7, [r9] + vbroadcasti128 ymm7, OWORD PTR [r9] vpxor ymm0, ymm0, ymm7 vpxor ymm1, ymm1, ymm7 vpxor ymm2, ymm2, ymm7 vpxor ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+16] + vbroadcasti128 ymm7, OWORD PTR [r9+16] vaesdec ymm0, ymm0, ymm7 vaesdec ymm1, ymm1, ymm7 vaesdec ymm2, ymm2, ymm7 vaesdec ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+32] + vbroadcasti128 ymm7, OWORD PTR [r9+32] vaesdec ymm0, ymm0, ymm7 vaesdec ymm1, ymm1, ymm7 vaesdec ymm2, ymm2, ymm7 vaesdec ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+48] + vbroadcasti128 ymm7, OWORD PTR [r9+48] vaesdec ymm0, ymm0, ymm7 vaesdec ymm1, ymm1, ymm7 vaesdec ymm2, ymm2, ymm7 vaesdec ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+64] + vbroadcasti128 ymm7, OWORD PTR [r9+64] vaesdec ymm0, ymm0, ymm7 vaesdec ymm1, ymm1, ymm7 vaesdec ymm2, ymm2, ymm7 vaesdec ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+80] + vbroadcasti128 ymm7, OWORD PTR [r9+80] vaesdec ymm0, ymm0, ymm7 vaesdec ymm1, ymm1, ymm7 vaesdec ymm2, ymm2, ymm7 vaesdec ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+96] + vbroadcasti128 ymm7, OWORD PTR [r9+96] vaesdec ymm0, ymm0, ymm7 vaesdec ymm1, ymm1, ymm7 vaesdec ymm2, ymm2, ymm7 vaesdec ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+112] + vbroadcasti128 ymm7, OWORD PTR [r9+112] vaesdec ymm0, ymm0, ymm7 vaesdec ymm1, ymm1, ymm7 vaesdec ymm2, ymm2, ymm7 vaesdec ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+128] + 
vbroadcasti128 ymm7, OWORD PTR [r9+128] vaesdec ymm0, ymm0, ymm7 vaesdec ymm1, ymm1, ymm7 vaesdec ymm2, ymm2, ymm7 vaesdec ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+144] + vbroadcasti128 ymm7, OWORD PTR [r9+144] vaesdec ymm0, ymm0, ymm7 vaesdec ymm1, ymm1, ymm7 vaesdec ymm2, ymm2, ymm7 vaesdec ymm3, ymm3, ymm7 cmp eax, 11 - vbroadcasti128 ymm7, [r9+160] + vbroadcasti128 ymm7, OWORD PTR [r9+160] jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last vaesdec ymm0, ymm0, ymm7 vaesdec ymm1, ymm1, ymm7 vaesdec ymm2, ymm2, ymm7 vaesdec ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+176] + vbroadcasti128 ymm7, OWORD PTR [r9+176] vaesdec ymm0, ymm0, ymm7 vaesdec ymm1, ymm1, ymm7 vaesdec ymm2, ymm2, ymm7 vaesdec ymm3, ymm3, ymm7 cmp eax, 13 - vbroadcasti128 ymm7, [r9+192] + vbroadcasti128 ymm7, OWORD PTR [r9+192] jl L_AES_ECB_decrypt_vaes_128_aes_dec_block_last vaesdec ymm0, ymm0, ymm7 vaesdec ymm1, ymm1, ymm7 vaesdec ymm2, ymm2, ymm7 vaesdec ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+208] + vbroadcasti128 ymm7, OWORD PTR [r9+208] vaesdec ymm0, ymm0, ymm7 vaesdec ymm1, ymm1, ymm7 vaesdec ymm2, ymm2, ymm7 vaesdec ymm3, ymm3, ymm7 - vbroadcasti128 ymm7, [r9+224] + vbroadcasti128 ymm7, OWORD PTR [r9+224] L_AES_ECB_decrypt_vaes_128_aes_dec_block_last: vaesdeclast ymm0, ymm0, ymm7 vaesdeclast ymm1, ymm1, ymm7 vaesdeclast ymm2, ymm2, ymm7 vaesdeclast ymm3, ymm3, ymm7 - vmovdqu YMMWORD PTR [r11], ymm0 - vmovdqu YMMWORD PTR [r11+32], ymm1 - vmovdqu YMMWORD PTR [r11+64], ymm2 - vmovdqu YMMWORD PTR [r11+96], ymm3 - add eax, 128 - cmp eax, r9d + vmovdqu YMMWORD PTR [r13], ymm0 + vmovdqu YMMWORD PTR [r13+32], ymm1 + vmovdqu YMMWORD PTR [r13+64], ymm2 + vmovdqu YMMWORD PTR [r13+96], ymm3 + add r10d, 128 + cmp r10d, r11d jl L_AES_ECB_decrypt_vaes_dec_128 L_AES_ECB_decrypt_vaes_done_128: - mov r9d, r8d - and r9d, 4294967264 - cmp eax, r9d + mov r11d, r8d + and r11d, 4294967264 + cmp r10d, r11d je L_AES_ECB_decrypt_vaes_done_32 L_AES_ECB_decrypt_vaes_dec_32: ; 32 bytes of input ; aes_ecb_dec_32 
- lea r10, QWORD PTR [rcx+rax] - lea r11, QWORD PTR [rdx+rax] - vmovdqu ymm0, YMMWORD PTR [r10] + lea r12, QWORD PTR [rcx+r10] + lea r13, QWORD PTR [rdx+r10] + vmovdqu ymm0, YMMWORD PTR [r12] ; aes_dec_block - vbroadcasti128 ymm7, [r9] + vbroadcasti128 ymm7, OWORD PTR [r9] vpxor ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+16] + vbroadcasti128 ymm7, OWORD PTR [r9+16] vaesdec ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+32] + vbroadcasti128 ymm7, OWORD PTR [r9+32] vaesdec ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+48] + vbroadcasti128 ymm7, OWORD PTR [r9+48] vaesdec ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+64] + vbroadcasti128 ymm7, OWORD PTR [r9+64] vaesdec ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+80] + vbroadcasti128 ymm7, OWORD PTR [r9+80] vaesdec ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+96] + vbroadcasti128 ymm7, OWORD PTR [r9+96] vaesdec ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+112] + vbroadcasti128 ymm7, OWORD PTR [r9+112] vaesdec ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+128] + vbroadcasti128 ymm7, OWORD PTR [r9+128] vaesdec ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+144] + vbroadcasti128 ymm7, OWORD PTR [r9+144] vaesdec ymm0, ymm0, ymm7 cmp eax, 11 - vbroadcasti128 ymm7, [r9+160] + vbroadcasti128 ymm7, OWORD PTR [r9+160] jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last vaesdec ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+176] + vbroadcasti128 ymm7, OWORD PTR [r9+176] vaesdec ymm0, ymm0, ymm7 cmp eax, 13 - vbroadcasti128 ymm7, [r9+192] + vbroadcasti128 ymm7, OWORD PTR [r9+192] jl L_AES_ECB_decrypt_vaes_32_aes_dec_block_last vaesdec ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+208] + vbroadcasti128 ymm7, OWORD PTR [r9+208] vaesdec ymm0, ymm0, ymm7 - vbroadcasti128 ymm7, [r9+224] + vbroadcasti128 ymm7, OWORD PTR [r9+224] L_AES_ECB_decrypt_vaes_32_aes_dec_block_last: vaesdeclast ymm0, ymm0, ymm7 - vmovdqu YMMWORD PTR [r11], ymm0 - add eax, 32 - cmp eax, r9d + vmovdqu YMMWORD PTR [r13], ymm0 + add r10d, 32 + cmp r10d, r11d jl 
L_AES_ECB_decrypt_vaes_dec_32 L_AES_ECB_decrypt_vaes_done_32: - cmp eax, r8d - mov r9d, r8d + cmp r10d, r8d + mov r11d, r8d je L_AES_ECB_decrypt_vaes_done_dec - and r9d, 4294967280 + and r11d, 4294967280 L_AES_ECB_decrypt_vaes_dec_16: ; 16 bytes of input - lea r10, QWORD PTR [rcx+rax] - vmovdqu xmm0, OWORD PTR [r10] + lea r12, QWORD PTR [rcx+r10] + vmovdqu xmm0, OWORD PTR [r12] ; aes_dec_block vpxor xmm0, xmm0, [r9] vmovdqu xmm5, OWORD PTR [r9+16] @@ -2479,30 +2521,34 @@ L_AES_ECB_decrypt_vaes_dec_16: vmovdqu xmm5, OWORD PTR [r9+224] L_AES_ECB_decrypt_vaes_16_aes_dec_block_last: vaesdeclast xmm0, xmm0, xmm5 - lea r10, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r10], xmm0 - add eax, 16 - cmp eax, r9d + lea r12, QWORD PTR [rdx+r10] + vmovdqu OWORD PTR [r12], xmm0 + add r10d, 16 + cmp r10d, r11d jl L_AES_ECB_decrypt_vaes_dec_16 L_AES_ECB_decrypt_vaes_done_dec: vmovdqu xmm6, OWORD PTR [rsp] vmovdqu xmm7, OWORD PTR [rsp+16] add rsp, 32 + pop r13 + pop r12 ret AES_ECB_decrypt_vaes ENDP _TEXT ENDS _TEXT SEGMENT READONLY PARA AES_CBC_encrypt_vaes PROC - mov rax, QWORD PTR [rsp+40] - mov r10d, DWORD PTR [rsp+48] + push r12 + push r13 + mov rax, QWORD PTR [rsp+56] + mov r10d, DWORD PTR [rsp+64] vmovdqu xmm0, OWORD PTR [r8] - xor eax, eax - cmp eax, r9d + xor r11d, r11d + cmp r11d, r9d je L_AES_CBC_encrypt_vaes_done L_AES_CBC_encrypt_vaes_loop: ; 16 bytes of input - lea r10, QWORD PTR [rcx+rax] - vmovdqu xmm1, OWORD PTR [r10] + lea r12, QWORD PTR [rcx+r11] + vmovdqu xmm1, OWORD PTR [r12] vpxor xmm1, xmm1, xmm0 ; aes_enc_block vpxor xmm1, xmm1, [rax] @@ -2539,22 +2585,26 @@ L_AES_CBC_encrypt_vaes_loop: vmovdqu xmm3, OWORD PTR [rax+224] L_AES_CBC_encrypt_vaes_aes_enc_block_last: vaesenclast xmm1, xmm1, xmm3 - lea r11, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r11], xmm1 + lea r13, QWORD PTR [rdx+r11] + vmovdqu OWORD PTR [r13], xmm1 vmovdqa xmm0, xmm1 - add eax, 16 - cmp eax, r9d + add r11d, 16 + cmp r11d, r9d jl L_AES_CBC_encrypt_vaes_loop L_AES_CBC_encrypt_vaes_done: vmovdqu 
OWORD PTR [r8], xmm0 + pop r13 + pop r12 ret AES_CBC_encrypt_vaes ENDP _TEXT ENDS _TEXT SEGMENT READONLY PARA AES_CBC_decrypt_vaes PROC push r12 - mov rax, QWORD PTR [rsp+48] - mov r10d, DWORD PTR [rsp+56] + push r13 + push r14 + mov rax, QWORD PTR [rsp+64] + mov r10d, DWORD PTR [rsp+72] sub rsp, 128 vmovdqu OWORD PTR [rsp], xmm6 vmovdqu OWORD PTR [rsp+16], xmm7 @@ -2565,101 +2615,101 @@ AES_CBC_decrypt_vaes PROC vmovdqu OWORD PTR [rsp+96], xmm12 vmovdqu OWORD PTR [rsp+112], xmm13 vmovdqu xmm8, OWORD PTR [r8] - xor eax, eax + xor r11d, r11d cmp r9d, 128 - mov r10d, r9d + mov r12d, r9d jl L_AES_CBC_decrypt_vaes_done_128 - and r10d, 4294967168 + and r12d, 4294967168 L_AES_CBC_decrypt_vaes_dec_128: ; 128 bytes of input ; aes_cbc_dec_128 - lea r11, QWORD PTR [rcx+rax] - lea r12, QWORD PTR [rdx+rax] - vmovdqu ymm0, YMMWORD PTR [r11] - vmovdqu ymm1, YMMWORD PTR [r11+32] - vmovdqu ymm2, YMMWORD PTR [r11+64] - vmovdqu ymm3, YMMWORD PTR [r11+96] + lea r13, QWORD PTR [rcx+r11] + lea r14, QWORD PTR [rdx+r11] + vmovdqu ymm0, YMMWORD PTR [r13] + vmovdqu ymm1, YMMWORD PTR [r13+32] + vmovdqu ymm2, YMMWORD PTR [r13+64] + vmovdqu ymm3, YMMWORD PTR [r13+96] vinserti128 ymm10, ymm8, xmm0, 1 - vmovdqu ymm11, YMMWORD PTR [r11+16] - vmovdqu ymm12, YMMWORD PTR [r11+48] - vmovdqu ymm13, YMMWORD PTR [r11+80] + vmovdqu ymm11, YMMWORD PTR [r13+16] + vmovdqu ymm12, YMMWORD PTR [r13+48] + vmovdqu ymm13, YMMWORD PTR [r13+80] vextracti128 xmm8, ymm3, 1 ; aes_dec_block - vbroadcasti128 ymm9, [rax] + vbroadcasti128 ymm9, OWORD PTR [rax] vpxor ymm0, ymm0, ymm9 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm9 vpxor ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [rax+16] + vbroadcasti128 ymm9, OWORD PTR [rax+16] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [rax+32] + vbroadcasti128 ymm9, OWORD PTR [rax+32] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, 
[rax+48] + vbroadcasti128 ymm9, OWORD PTR [rax+48] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [rax+64] + vbroadcasti128 ymm9, OWORD PTR [rax+64] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [rax+80] + vbroadcasti128 ymm9, OWORD PTR [rax+80] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [rax+96] + vbroadcasti128 ymm9, OWORD PTR [rax+96] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [rax+112] + vbroadcasti128 ymm9, OWORD PTR [rax+112] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [rax+128] + vbroadcasti128 ymm9, OWORD PTR [rax+128] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [rax+144] + vbroadcasti128 ymm9, OWORD PTR [rax+144] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 cmp r10d, 11 - vbroadcasti128 ymm9, [rax+160] + vbroadcasti128 ymm9, OWORD PTR [rax+160] jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [rax+176] + vbroadcasti128 ymm9, OWORD PTR [rax+176] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 cmp r10d, 13 - vbroadcasti128 ymm9, [rax+192] + vbroadcasti128 ymm9, OWORD PTR [rax+192] jl L_AES_CBC_decrypt_vaes_128_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [rax+208] + vbroadcasti128 ymm9, OWORD PTR [rax+208] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, 
ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [rax+224] + vbroadcasti128 ymm9, OWORD PTR [rax+224] L_AES_CBC_decrypt_vaes_128_aes_dec_block_last: vaesdeclast ymm0, ymm0, ymm9 vaesdeclast ymm1, ymm1, ymm9 @@ -2669,76 +2719,76 @@ L_AES_CBC_decrypt_vaes_128_aes_dec_block_last: vpxor ymm1, ymm1, ymm11 vpxor ymm2, ymm2, ymm12 vpxor ymm3, ymm3, ymm13 - vmovdqu YMMWORD PTR [r12], ymm0 - vmovdqu YMMWORD PTR [r12+32], ymm1 - vmovdqu YMMWORD PTR [r12+64], ymm2 - vmovdqu YMMWORD PTR [r12+96], ymm3 - add eax, 128 - cmp eax, r10d + vmovdqu YMMWORD PTR [r14], ymm0 + vmovdqu YMMWORD PTR [r14+32], ymm1 + vmovdqu YMMWORD PTR [r14+64], ymm2 + vmovdqu YMMWORD PTR [r14+96], ymm3 + add r11d, 128 + cmp r11d, r12d jl L_AES_CBC_decrypt_vaes_dec_128 L_AES_CBC_decrypt_vaes_done_128: - mov r10d, r9d - and r10d, 4294967264 - cmp eax, r10d + mov r12d, r9d + and r12d, 4294967264 + cmp r11d, r12d je L_AES_CBC_decrypt_vaes_done_32 L_AES_CBC_decrypt_vaes_dec_32: ; 32 bytes of input ; aes_cbc_dec_32 - lea r11, QWORD PTR [rcx+rax] - lea r12, QWORD PTR [rdx+rax] - vmovdqu ymm0, YMMWORD PTR [r11] + lea r13, QWORD PTR [rcx+r11] + lea r14, QWORD PTR [rdx+r11] + vmovdqu ymm0, YMMWORD PTR [r13] vinserti128 ymm10, ymm8, xmm0, 1 vextracti128 xmm8, ymm0, 1 ; aes_dec_block - vbroadcasti128 ymm9, [rax] + vbroadcasti128 ymm9, OWORD PTR [rax] vpxor ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [rax+16] + vbroadcasti128 ymm9, OWORD PTR [rax+16] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [rax+32] + vbroadcasti128 ymm9, OWORD PTR [rax+32] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [rax+48] + vbroadcasti128 ymm9, OWORD PTR [rax+48] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [rax+64] + vbroadcasti128 ymm9, OWORD PTR [rax+64] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [rax+80] + vbroadcasti128 ymm9, OWORD PTR [rax+80] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [rax+96] + vbroadcasti128 ymm9, OWORD PTR [rax+96] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [rax+112] + vbroadcasti128 ymm9, 
OWORD PTR [rax+112] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [rax+128] + vbroadcasti128 ymm9, OWORD PTR [rax+128] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [rax+144] + vbroadcasti128 ymm9, OWORD PTR [rax+144] vaesdec ymm0, ymm0, ymm9 cmp r10d, 11 - vbroadcasti128 ymm9, [rax+160] + vbroadcasti128 ymm9, OWORD PTR [rax+160] jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [rax+176] + vbroadcasti128 ymm9, OWORD PTR [rax+176] vaesdec ymm0, ymm0, ymm9 cmp r10d, 13 - vbroadcasti128 ymm9, [rax+192] + vbroadcasti128 ymm9, OWORD PTR [rax+192] jl L_AES_CBC_decrypt_vaes_32_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [rax+208] + vbroadcasti128 ymm9, OWORD PTR [rax+208] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [rax+224] + vbroadcasti128 ymm9, OWORD PTR [rax+224] L_AES_CBC_decrypt_vaes_32_aes_dec_block_last: vaesdeclast ymm0, ymm0, ymm9 vpxor ymm0, ymm0, ymm10 - vmovdqu YMMWORD PTR [r12], ymm0 - add eax, 32 - cmp eax, r10d + vmovdqu YMMWORD PTR [r14], ymm0 + add r11d, 32 + cmp r11d, r12d jl L_AES_CBC_decrypt_vaes_dec_32 L_AES_CBC_decrypt_vaes_done_32: - cmp eax, r9d - mov r10d, r9d + cmp r11d, r9d + mov r12d, r9d je L_AES_CBC_decrypt_vaes_done_dec - and r10d, 4294967280 + and r12d, 4294967280 L_AES_CBC_decrypt_vaes_dec_16: ; 16 bytes of input - lea r11, QWORD PTR [rcx+rax] - vmovdqu xmm0, OWORD PTR [r11] + lea r13, QWORD PTR [rcx+r11] + vmovdqu xmm0, OWORD PTR [r13] vmovdqa xmm7, xmm0 ; aes_dec_block vpxor xmm0, xmm0, [rax] @@ -2777,10 +2827,10 @@ L_AES_CBC_decrypt_vaes_16_aes_dec_block_last: vaesdeclast xmm0, xmm0, xmm5 vpxor xmm0, xmm0, xmm8 vmovdqa xmm8, xmm7 - lea r11, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r11], xmm0 - add eax, 16 - cmp eax, r10d + lea r13, QWORD PTR [rdx+r11] + vmovdqu OWORD PTR [r13], xmm0 + add r11d, 16 + cmp r11d, r12d jl L_AES_CBC_decrypt_vaes_dec_16 L_AES_CBC_decrypt_vaes_done_dec: vmovdqu OWORD PTR [r8], xmm8 @@ -2793,43 +2843,45 @@ 
L_AES_CBC_decrypt_vaes_done_dec: vmovdqu xmm12, OWORD PTR [rsp+96] vmovdqu xmm13, OWORD PTR [rsp+112] add rsp, 128 + pop r14 + pop r13 pop r12 ret AES_CBC_decrypt_vaes ENDP _TEXT ENDS _DATA SEGMENT ALIGN 16 -L_aes_ctr_bswap_vaes QWORD \ - 08090a0b0c0d0e0fh, 0001020304050607h +L_aes_ctr_bswap_vaes QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_aes_ctr_bswap_vaes QWORD L_aes_ctr_bswap_vaes _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_ctr_inc_vaes QWORD \ - 0000000000000000h, 0000000000000000h, - 0000000000000001h, 0000000000000000h, - 0000000000000002h, 0000000000000000h, - 0000000000000003h, 0000000000000000h, - 0000000000000004h, 0000000000000000h, - 0000000000000005h, 0000000000000000h, - 0000000000000006h, 0000000000000000h, - 0000000000000007h, 0000000000000000h, - 0000000000000008h, 0000000000000000h, - 0000000000000009h, 0000000000000000h, - 000000000000000ah, 0000000000000000h, - 000000000000000bh, 0000000000000000h, - 000000000000000ch, 0000000000000000h, - 000000000000000dh, 0000000000000000h, - 000000000000000eh, 0000000000000000h, - 000000000000000fh, 0000000000000000h, - 0000000000000010h, 0000000000000000h +L_aes_ctr_inc_vaes QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000000000001h, 0000000000000000h + QWORD 0000000000000002h, 0000000000000000h + QWORD 0000000000000003h, 0000000000000000h + QWORD 0000000000000004h, 0000000000000000h + QWORD 0000000000000005h, 0000000000000000h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000000000008h, 0000000000000000h + QWORD 0000000000000009h, 0000000000000000h + QWORD 000000000000000ah, 0000000000000000h + QWORD 000000000000000bh, 0000000000000000h + QWORD 000000000000000ch, 0000000000000000h + QWORD 000000000000000dh, 0000000000000000h + QWORD 000000000000000eh, 0000000000000000h + QWORD 000000000000000fh, 0000000000000000h + QWORD 0000000000000010h, 0000000000000000h ptr_L_aes_ctr_inc_vaes QWORD L_aes_ctr_inc_vaes _DATA ENDS _TEXT SEGMENT READONLY 
PARA AES_CTR_encrypt_vaes PROC - push rbx - mov eax, DWORD PTR [rsp+48] - mov r10, QWORD PTR [rsp+56] + push r12 + push r13 + push r14 + mov eax, DWORD PTR [rsp+64] + mov r10, QWORD PTR [rsp+72] sub rsp, 144 vmovdqu OWORD PTR [rsp], xmm6 vmovdqu OWORD PTR [rsp+16], xmm7 @@ -2841,16 +2893,16 @@ AES_CTR_encrypt_vaes PROC vmovdqu OWORD PTR [rsp+112], xmm13 vmovdqu OWORD PTR [rsp+128], xmm14 vbroadcasti128 ymm8, ptr_L_aes_ctr_bswap_vaes - vbroadcasti128 ymm7, [r10] + vbroadcasti128 ymm7, OWORD PTR [r10] vpshufb ymm7, ymm7, ymm8 - vbroadcasti128 ymm10, [ptr_L_aes_ctr_inc_vaes+128] - vbroadcasti128 ymm11, [ptr_L_aes_ctr_inc_vaes+32] - vbroadcasti128 ymm12, [ptr_L_aes_ctr_inc_vaes+16] - xor eax, eax + vbroadcasti128 ymm10, OWORD PTR [ptr_L_aes_ctr_inc_vaes+128] + vbroadcasti128 ymm11, OWORD PTR [ptr_L_aes_ctr_inc_vaes+32] + vbroadcasti128 ymm12, OWORD PTR [ptr_L_aes_ctr_inc_vaes+16] + xor r11d, r11d cmp r8d, 128 - mov r10d, r8d + mov r12d, r8d jl L_AES_CTR_encrypt_vaes_done_128 - and r10d, 4294967168 + and r12d, 4294967168 vmovdqa ymm9, ymm7 vpaddq ymm4, ymm7, [ptr_L_aes_ctr_inc_vaes] vpand ymm14, ymm9, [ptr_L_aes_ctr_inc_vaes] @@ -2889,8 +2941,8 @@ AES_CTR_encrypt_vaes PROC vpaddq ymm7, ymm7, ymm9 L_AES_CTR_encrypt_vaes_enc_128: ; 128 bytes of input - lea r11, QWORD PTR [rcx+rax] - lea rbx, QWORD PTR [rdx+rax] + lea r13, QWORD PTR [rcx+r11] + lea r14, QWORD PTR [rdx+r11] vpshufb ymm0, ymm4, ymm8 vpshufb ymm1, ymm5, ymm8 vpshufb ymm2, ymm6, ymm8 @@ -2932,108 +2984,108 @@ L_AES_CTR_encrypt_vaes_enc_128: vpslldq ymm9, ymm9, 8 vpaddq ymm7, ymm7, ymm9 ; aes_enc_block - vbroadcasti128 ymm13, [r9] + vbroadcasti128 ymm13, OWORD PTR [r9] vpxor ymm0, ymm0, ymm13 vpxor ymm1, ymm1, ymm13 vpxor ymm2, ymm2, ymm13 vpxor ymm3, ymm3, ymm13 - vbroadcasti128 ymm13, [r9+16] + vbroadcasti128 ymm13, OWORD PTR [r9+16] vaesenc ymm0, ymm0, ymm13 vaesenc ymm1, ymm1, ymm13 vaesenc ymm2, ymm2, ymm13 vaesenc ymm3, ymm3, ymm13 - vbroadcasti128 ymm13, [r9+32] + vbroadcasti128 ymm13, OWORD PTR [r9+32] 
vaesenc ymm0, ymm0, ymm13 vaesenc ymm1, ymm1, ymm13 vaesenc ymm2, ymm2, ymm13 vaesenc ymm3, ymm3, ymm13 - vbroadcasti128 ymm13, [r9+48] + vbroadcasti128 ymm13, OWORD PTR [r9+48] vaesenc ymm0, ymm0, ymm13 vaesenc ymm1, ymm1, ymm13 vaesenc ymm2, ymm2, ymm13 vaesenc ymm3, ymm3, ymm13 - vbroadcasti128 ymm13, [r9+64] + vbroadcasti128 ymm13, OWORD PTR [r9+64] vaesenc ymm0, ymm0, ymm13 vaesenc ymm1, ymm1, ymm13 vaesenc ymm2, ymm2, ymm13 vaesenc ymm3, ymm3, ymm13 - vbroadcasti128 ymm13, [r9+80] + vbroadcasti128 ymm13, OWORD PTR [r9+80] vaesenc ymm0, ymm0, ymm13 vaesenc ymm1, ymm1, ymm13 vaesenc ymm2, ymm2, ymm13 vaesenc ymm3, ymm3, ymm13 - vbroadcasti128 ymm13, [r9+96] + vbroadcasti128 ymm13, OWORD PTR [r9+96] vaesenc ymm0, ymm0, ymm13 vaesenc ymm1, ymm1, ymm13 vaesenc ymm2, ymm2, ymm13 vaesenc ymm3, ymm3, ymm13 - vbroadcasti128 ymm13, [r9+112] + vbroadcasti128 ymm13, OWORD PTR [r9+112] vaesenc ymm0, ymm0, ymm13 vaesenc ymm1, ymm1, ymm13 vaesenc ymm2, ymm2, ymm13 vaesenc ymm3, ymm3, ymm13 - vbroadcasti128 ymm13, [r9+128] + vbroadcasti128 ymm13, OWORD PTR [r9+128] vaesenc ymm0, ymm0, ymm13 vaesenc ymm1, ymm1, ymm13 vaesenc ymm2, ymm2, ymm13 vaesenc ymm3, ymm3, ymm13 - vbroadcasti128 ymm13, [r9+144] + vbroadcasti128 ymm13, OWORD PTR [r9+144] vaesenc ymm0, ymm0, ymm13 vaesenc ymm1, ymm1, ymm13 vaesenc ymm2, ymm2, ymm13 vaesenc ymm3, ymm3, ymm13 cmp eax, 11 - vbroadcasti128 ymm13, [r9+160] + vbroadcasti128 ymm13, OWORD PTR [r9+160] jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last vaesenc ymm0, ymm0, ymm13 vaesenc ymm1, ymm1, ymm13 vaesenc ymm2, ymm2, ymm13 vaesenc ymm3, ymm3, ymm13 - vbroadcasti128 ymm13, [r9+176] + vbroadcasti128 ymm13, OWORD PTR [r9+176] vaesenc ymm0, ymm0, ymm13 vaesenc ymm1, ymm1, ymm13 vaesenc ymm2, ymm2, ymm13 vaesenc ymm3, ymm3, ymm13 cmp eax, 13 - vbroadcasti128 ymm13, [r9+192] + vbroadcasti128 ymm13, OWORD PTR [r9+192] jl L_AES_CTR_encrypt_vaes_128_aes_enc_block_last vaesenc ymm0, ymm0, ymm13 vaesenc ymm1, ymm1, ymm13 vaesenc ymm2, ymm2, ymm13 vaesenc 
ymm3, ymm3, ymm13 - vbroadcasti128 ymm13, [r9+208] + vbroadcasti128 ymm13, OWORD PTR [r9+208] vaesenc ymm0, ymm0, ymm13 vaesenc ymm1, ymm1, ymm13 vaesenc ymm2, ymm2, ymm13 vaesenc ymm3, ymm3, ymm13 - vbroadcasti128 ymm13, [r9+224] + vbroadcasti128 ymm13, OWORD PTR [r9+224] L_AES_CTR_encrypt_vaes_128_aes_enc_block_last: vaesenclast ymm0, ymm0, ymm13 vaesenclast ymm1, ymm1, ymm13 vaesenclast ymm2, ymm2, ymm13 vaesenclast ymm3, ymm3, ymm13 - vpxor ymm0, ymm0, [r11] - vpxor ymm1, ymm1, [r11+32] - vpxor ymm2, ymm2, [r11+64] - vpxor ymm3, ymm3, [r11+96] - vmovdqu YMMWORD PTR [rbx], ymm0 - vmovdqu YMMWORD PTR [rbx+32], ymm1 - vmovdqu YMMWORD PTR [rbx+64], ymm2 - vmovdqu YMMWORD PTR [rbx+96], ymm3 - add eax, 128 - cmp eax, r10d + vpxor ymm0, ymm0, [r13] + vpxor ymm1, ymm1, [r13+32] + vpxor ymm2, ymm2, [r13+64] + vpxor ymm3, ymm3, [r13+96] + vmovdqu YMMWORD PTR [r14], ymm0 + vmovdqu YMMWORD PTR [r14+32], ymm1 + vmovdqu YMMWORD PTR [r14+64], ymm2 + vmovdqu YMMWORD PTR [r14+96], ymm3 + add r11d, 128 + cmp r11d, r12d jl L_AES_CTR_encrypt_vaes_enc_128 vperm2i128 ymm7, ymm4, ymm4, 0 L_AES_CTR_encrypt_vaes_done_128: - mov r10d, r8d - and r10d, 4294967264 - cmp eax, r10d + mov r12d, r8d + and r12d, 4294967264 + cmp r11d, r12d je L_AES_CTR_encrypt_vaes_done_32 L_AES_CTR_encrypt_vaes_enc_32: ; 32 bytes of input ; aes_ctr_enc_32 - lea r11, QWORD PTR [rcx+rax] - lea rbx, QWORD PTR [rdx+rax] + lea r13, QWORD PTR [rcx+r11] + lea r14, QWORD PTR [rdx+r11] vpaddq ymm0, ymm7, [ptr_L_aes_ctr_inc_vaes] vmovdqa ymm9, ymm7 vpand ymm14, ymm9, [ptr_L_aes_ctr_inc_vaes] @@ -3054,51 +3106,51 @@ L_AES_CTR_encrypt_vaes_enc_32: vpslldq ymm9, ymm9, 8 vpaddq ymm7, ymm7, ymm9 ; aes_enc_block - vbroadcasti128 ymm13, [r9] + vbroadcasti128 ymm13, OWORD PTR [r9] vpxor ymm0, ymm0, ymm13 - vbroadcasti128 ymm13, [r9+16] + vbroadcasti128 ymm13, OWORD PTR [r9+16] vaesenc ymm0, ymm0, ymm13 - vbroadcasti128 ymm13, [r9+32] + vbroadcasti128 ymm13, OWORD PTR [r9+32] vaesenc ymm0, ymm0, ymm13 - vbroadcasti128 ymm13, 
[r9+48] + vbroadcasti128 ymm13, OWORD PTR [r9+48] vaesenc ymm0, ymm0, ymm13 - vbroadcasti128 ymm13, [r9+64] + vbroadcasti128 ymm13, OWORD PTR [r9+64] vaesenc ymm0, ymm0, ymm13 - vbroadcasti128 ymm13, [r9+80] + vbroadcasti128 ymm13, OWORD PTR [r9+80] vaesenc ymm0, ymm0, ymm13 - vbroadcasti128 ymm13, [r9+96] + vbroadcasti128 ymm13, OWORD PTR [r9+96] vaesenc ymm0, ymm0, ymm13 - vbroadcasti128 ymm13, [r9+112] + vbroadcasti128 ymm13, OWORD PTR [r9+112] vaesenc ymm0, ymm0, ymm13 - vbroadcasti128 ymm13, [r9+128] + vbroadcasti128 ymm13, OWORD PTR [r9+128] vaesenc ymm0, ymm0, ymm13 - vbroadcasti128 ymm13, [r9+144] + vbroadcasti128 ymm13, OWORD PTR [r9+144] vaesenc ymm0, ymm0, ymm13 cmp eax, 11 - vbroadcasti128 ymm13, [r9+160] + vbroadcasti128 ymm13, OWORD PTR [r9+160] jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last vaesenc ymm0, ymm0, ymm13 - vbroadcasti128 ymm13, [r9+176] + vbroadcasti128 ymm13, OWORD PTR [r9+176] vaesenc ymm0, ymm0, ymm13 cmp eax, 13 - vbroadcasti128 ymm13, [r9+192] + vbroadcasti128 ymm13, OWORD PTR [r9+192] jl L_AES_CTR_encrypt_vaes_32_aes_enc_block_last vaesenc ymm0, ymm0, ymm13 - vbroadcasti128 ymm13, [r9+208] + vbroadcasti128 ymm13, OWORD PTR [r9+208] vaesenc ymm0, ymm0, ymm13 - vbroadcasti128 ymm13, [r9+224] + vbroadcasti128 ymm13, OWORD PTR [r9+224] L_AES_CTR_encrypt_vaes_32_aes_enc_block_last: vaesenclast ymm0, ymm0, ymm13 - vpxor ymm0, ymm0, [r11] - vmovdqu YMMWORD PTR [rbx], ymm0 - add eax, 32 - cmp eax, r10d + vpxor ymm0, ymm0, [r13] + vmovdqu YMMWORD PTR [r14], ymm0 + add r11d, 32 + cmp r11d, r12d jl L_AES_CTR_encrypt_vaes_enc_32 L_AES_CTR_encrypt_vaes_done_32: - cmp eax, r8d - mov r10d, r8d + cmp r11d, r8d + mov r12d, r8d je L_AES_CTR_encrypt_vaes_done_enc - and r10d, 4294967280 + and r12d, 4294967280 L_AES_CTR_encrypt_vaes_enc_16: ; 16 bytes of input vpshufb xmm0, xmm7, xmm8 @@ -3146,12 +3198,12 @@ L_AES_CTR_encrypt_vaes_enc_16: vmovdqu xmm5, OWORD PTR [r9+224] L_AES_CTR_encrypt_vaes_16_aes_enc_block_last: vaesenclast xmm0, xmm0, xmm5 - lea 
r11, QWORD PTR [rcx+rax] - vpxor xmm0, xmm0, [r11] - lea r11, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r11], xmm0 - add eax, 16 - cmp eax, r10d + lea r13, QWORD PTR [rcx+r11] + vpxor xmm0, xmm0, [r13] + lea r13, QWORD PTR [rdx+r11] + vmovdqu OWORD PTR [r13], xmm0 + add r11d, 16 + cmp r11d, r12d jl L_AES_CTR_encrypt_vaes_enc_16 L_AES_CTR_encrypt_vaes_done_enc: vpshufb xmm0, xmm7, xmm8 @@ -3166,7 +3218,9 @@ L_AES_CTR_encrypt_vaes_done_enc: vmovdqu xmm13, OWORD PTR [rsp+112] vmovdqu xmm14, OWORD PTR [rsp+128] add rsp, 144 - pop rbx + pop r14 + pop r13 + pop r12 ret AES_CTR_encrypt_vaes ENDP _TEXT ENDS @@ -3174,7 +3228,9 @@ ENDIF IFDEF HAVE_INTEL_AVX512 _TEXT SEGMENT READONLY PARA AES_ECB_encrypt_avx512 PROC - mov eax, DWORD PTR [rsp+40] + push r12 + push r13 + mov eax, DWORD PTR [rsp+56] sub rsp, 160 vmovdqu OWORD PTR [rsp], xmm6 vmovdqu OWORD PTR [rsp+16], xmm7 @@ -3186,42 +3242,42 @@ AES_ECB_encrypt_avx512 PROC vmovdqu OWORD PTR [rsp+112], xmm13 vmovdqu OWORD PTR [rsp+128], xmm14 vmovdqu OWORD PTR [rsp+144], xmm15 - xor eax, eax + xor r10d, r10d cmp r8d, 64 jl L_AES_ECB_encrypt_avx512_done_64 - vbroadcasti32x4 zmm8, [r9] - vbroadcasti32x4 zmm9, [r9+16] - vbroadcasti32x4 zmm10, [r9+32] - vbroadcasti32x4 zmm11, [r9+48] - vbroadcasti32x4 zmm12, [r9+64] - vbroadcasti32x4 zmm13, [r9+80] - vbroadcasti32x4 zmm14, [r9+96] - vbroadcasti32x4 zmm15, [r9+112] - vbroadcasti32x4 zmm16, [r9+128] - vbroadcasti32x4 zmm17, [r9+144] - vbroadcasti32x4 zmm18, [r9+160] + vbroadcasti32x4 zmm8, OWORD PTR [r9] + vbroadcasti32x4 zmm9, OWORD PTR [r9+16] + vbroadcasti32x4 zmm10, OWORD PTR [r9+32] + vbroadcasti32x4 zmm11, OWORD PTR [r9+48] + vbroadcasti32x4 zmm12, OWORD PTR [r9+64] + vbroadcasti32x4 zmm13, OWORD PTR [r9+80] + vbroadcasti32x4 zmm14, OWORD PTR [r9+96] + vbroadcasti32x4 zmm15, OWORD PTR [r9+112] + vbroadcasti32x4 zmm16, OWORD PTR [r9+128] + vbroadcasti32x4 zmm17, OWORD PTR [r9+144] + vbroadcasti32x4 zmm18, OWORD PTR [r9+160] cmp eax, 11 jl L_AES_ECB_encrypt_avx512_key_cached - 
vbroadcasti32x4 zmm19, [r9+176] - vbroadcasti32x4 zmm20, [r9+192] + vbroadcasti32x4 zmm19, OWORD PTR [r9+176] + vbroadcasti32x4 zmm20, OWORD PTR [r9+192] cmp eax, 13 jl L_AES_ECB_encrypt_avx512_key_cached - vbroadcasti32x4 zmm21, [r9+208] - vbroadcasti32x4 zmm22, [r9+224] + vbroadcasti32x4 zmm21, OWORD PTR [r9+208] + vbroadcasti32x4 zmm22, OWORD PTR [r9+224] L_AES_ECB_encrypt_avx512_key_cached: cmp r8d, 256 - mov r9d, r8d + mov r11d, r8d jl L_AES_ECB_encrypt_avx512_done_256 - and r9d, 4294967040 + and r11d, 4294967040 L_AES_ECB_encrypt_avx512_enc_256: ; 256 bytes of input ; aes_ecb_enc_256 - lea r10, QWORD PTR [rcx+rax] - lea r11, QWORD PTR [rdx+rax] - vmovdqu64 zmm0, [r10] - vmovdqu64 zmm1, [r10+64] - vmovdqu64 zmm2, [r10+128] - vmovdqu64 zmm3, [r10+192] + lea r12, QWORD PTR [rcx+r10] + lea r13, QWORD PTR [rdx+r10] + vmovdqu64 zmm0, [r12] + vmovdqu64 zmm1, [r12+64] + vmovdqu64 zmm2, [r12+128] + vmovdqu64 zmm3, [r12+192] ; aes_enc_block vpxorq zmm0, zmm0, zmm8 vpxorq zmm1, zmm1, zmm8 @@ -3291,24 +3347,24 @@ L_AES_ECB_encrypt_avx512_256_aes_enc_block_last: vaesenclast zmm1, zmm1, zmm7 vaesenclast zmm2, zmm2, zmm7 vaesenclast zmm3, zmm3, zmm7 - vmovdqu64 [r11], zmm0 - vmovdqu64 [r11+64], zmm1 - vmovdqu64 [r11+128], zmm2 - vmovdqu64 [r11+192], zmm3 - add eax, 256 - cmp eax, r9d + vmovdqu64 [r13], zmm0 + vmovdqu64 [r13+64], zmm1 + vmovdqu64 [r13+128], zmm2 + vmovdqu64 [r13+192], zmm3 + add r10d, 256 + cmp r10d, r11d jl L_AES_ECB_encrypt_avx512_enc_256 L_AES_ECB_encrypt_avx512_done_256: - mov r9d, r8d - and r9d, 4294967232 - cmp eax, r9d + mov r11d, r8d + and r11d, 4294967232 + cmp r10d, r11d je L_AES_ECB_encrypt_avx512_done_64 L_AES_ECB_encrypt_avx512_enc_64: ; 64 bytes of input ; aes_ecb_enc_64 - lea r10, QWORD PTR [rcx+rax] - lea r11, QWORD PTR [rdx+rax] - vmovdqu64 zmm0, [r10] + lea r12, QWORD PTR [rcx+r10] + lea r13, QWORD PTR [rdx+r10] + vmovdqu64 zmm0, [r12] ; aes_enc_block vpxorq zmm0, zmm0, zmm8 vaesenc zmm0, zmm0, zmm9 @@ -3333,19 +3389,19 @@ 
L_AES_ECB_encrypt_avx512_enc_64: vmovdqa64 zmm7, zmm22 L_AES_ECB_encrypt_avx512_64_aes_enc_block_last: vaesenclast zmm0, zmm0, zmm7 - vmovdqu64 [r11], zmm0 - add eax, 64 - cmp eax, r9d + vmovdqu64 [r13], zmm0 + add r10d, 64 + cmp r10d, r11d jl L_AES_ECB_encrypt_avx512_enc_64 L_AES_ECB_encrypt_avx512_done_64: - cmp eax, r8d - mov r9d, r8d + cmp r10d, r8d + mov r11d, r8d je L_AES_ECB_encrypt_avx512_done_enc - and r9d, 4294967280 + and r11d, 4294967280 L_AES_ECB_encrypt_avx512_enc_16: ; 16 bytes of input - lea r10, QWORD PTR [rcx+rax] - vmovdqu xmm0, OWORD PTR [r10] + lea r12, QWORD PTR [rcx+r10] + vmovdqu xmm0, OWORD PTR [r12] ; aes_enc_block vpxor xmm0, xmm0, [r9] vmovdqu xmm5, OWORD PTR [r9+16] @@ -3381,10 +3437,10 @@ L_AES_ECB_encrypt_avx512_enc_16: vmovdqu xmm5, OWORD PTR [r9+224] L_AES_ECB_encrypt_avx512_16_aes_enc_block_last: vaesenclast xmm0, xmm0, xmm5 - lea r10, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r10], xmm0 - add eax, 16 - cmp eax, r9d + lea r12, QWORD PTR [rdx+r10] + vmovdqu OWORD PTR [r12], xmm0 + add r10d, 16 + cmp r10d, r11d jl L_AES_ECB_encrypt_avx512_enc_16 L_AES_ECB_encrypt_avx512_done_enc: vmovdqu xmm6, OWORD PTR [rsp] @@ -3398,12 +3454,16 @@ L_AES_ECB_encrypt_avx512_done_enc: vmovdqu xmm14, OWORD PTR [rsp+128] vmovdqu xmm15, OWORD PTR [rsp+144] add rsp, 160 + pop r13 + pop r12 ret AES_ECB_encrypt_avx512 ENDP _TEXT ENDS _TEXT SEGMENT READONLY PARA AES_ECB_decrypt_avx512 PROC - mov eax, DWORD PTR [rsp+40] + push r12 + push r13 + mov eax, DWORD PTR [rsp+56] sub rsp, 160 vmovdqu OWORD PTR [rsp], xmm6 vmovdqu OWORD PTR [rsp+16], xmm7 @@ -3415,42 +3475,42 @@ AES_ECB_decrypt_avx512 PROC vmovdqu OWORD PTR [rsp+112], xmm13 vmovdqu OWORD PTR [rsp+128], xmm14 vmovdqu OWORD PTR [rsp+144], xmm15 - xor eax, eax + xor r10d, r10d cmp r8d, 64 jl L_AES_ECB_decrypt_avx512_done_64 - vbroadcasti32x4 zmm8, [r9] - vbroadcasti32x4 zmm9, [r9+16] - vbroadcasti32x4 zmm10, [r9+32] - vbroadcasti32x4 zmm11, [r9+48] - vbroadcasti32x4 zmm12, [r9+64] - vbroadcasti32x4 zmm13, 
[r9+80] - vbroadcasti32x4 zmm14, [r9+96] - vbroadcasti32x4 zmm15, [r9+112] - vbroadcasti32x4 zmm16, [r9+128] - vbroadcasti32x4 zmm17, [r9+144] - vbroadcasti32x4 zmm18, [r9+160] + vbroadcasti32x4 zmm8, OWORD PTR [r9] + vbroadcasti32x4 zmm9, OWORD PTR [r9+16] + vbroadcasti32x4 zmm10, OWORD PTR [r9+32] + vbroadcasti32x4 zmm11, OWORD PTR [r9+48] + vbroadcasti32x4 zmm12, OWORD PTR [r9+64] + vbroadcasti32x4 zmm13, OWORD PTR [r9+80] + vbroadcasti32x4 zmm14, OWORD PTR [r9+96] + vbroadcasti32x4 zmm15, OWORD PTR [r9+112] + vbroadcasti32x4 zmm16, OWORD PTR [r9+128] + vbroadcasti32x4 zmm17, OWORD PTR [r9+144] + vbroadcasti32x4 zmm18, OWORD PTR [r9+160] cmp eax, 11 jl L_AES_ECB_decrypt_avx512_key_cached - vbroadcasti32x4 zmm19, [r9+176] - vbroadcasti32x4 zmm20, [r9+192] + vbroadcasti32x4 zmm19, OWORD PTR [r9+176] + vbroadcasti32x4 zmm20, OWORD PTR [r9+192] cmp eax, 13 jl L_AES_ECB_decrypt_avx512_key_cached - vbroadcasti32x4 zmm21, [r9+208] - vbroadcasti32x4 zmm22, [r9+224] + vbroadcasti32x4 zmm21, OWORD PTR [r9+208] + vbroadcasti32x4 zmm22, OWORD PTR [r9+224] L_AES_ECB_decrypt_avx512_key_cached: cmp r8d, 256 - mov r9d, r8d + mov r11d, r8d jl L_AES_ECB_decrypt_avx512_done_256 - and r9d, 4294967040 + and r11d, 4294967040 L_AES_ECB_decrypt_avx512_dec_256: ; 256 bytes of input ; aes_ecb_dec_256 - lea r10, QWORD PTR [rcx+rax] - lea r11, QWORD PTR [rdx+rax] - vmovdqu64 zmm0, [r10] - vmovdqu64 zmm1, [r10+64] - vmovdqu64 zmm2, [r10+128] - vmovdqu64 zmm3, [r10+192] + lea r12, QWORD PTR [rcx+r10] + lea r13, QWORD PTR [rdx+r10] + vmovdqu64 zmm0, [r12] + vmovdqu64 zmm1, [r12+64] + vmovdqu64 zmm2, [r12+128] + vmovdqu64 zmm3, [r12+192] ; aes_dec_block vpxorq zmm0, zmm0, zmm8 vpxorq zmm1, zmm1, zmm8 @@ -3520,24 +3580,24 @@ L_AES_ECB_decrypt_avx512_256_aes_dec_block_last: vaesdeclast zmm1, zmm1, zmm7 vaesdeclast zmm2, zmm2, zmm7 vaesdeclast zmm3, zmm3, zmm7 - vmovdqu64 [r11], zmm0 - vmovdqu64 [r11+64], zmm1 - vmovdqu64 [r11+128], zmm2 - vmovdqu64 [r11+192], zmm3 - add eax, 256 - cmp eax, r9d + 
vmovdqu64 [r13], zmm0 + vmovdqu64 [r13+64], zmm1 + vmovdqu64 [r13+128], zmm2 + vmovdqu64 [r13+192], zmm3 + add r10d, 256 + cmp r10d, r11d jl L_AES_ECB_decrypt_avx512_dec_256 L_AES_ECB_decrypt_avx512_done_256: - mov r9d, r8d - and r9d, 4294967232 - cmp eax, r9d + mov r11d, r8d + and r11d, 4294967232 + cmp r10d, r11d je L_AES_ECB_decrypt_avx512_done_64 L_AES_ECB_decrypt_avx512_dec_64: ; 64 bytes of input ; aes_ecb_dec_64 - lea r10, QWORD PTR [rcx+rax] - lea r11, QWORD PTR [rdx+rax] - vmovdqu64 zmm0, [r10] + lea r12, QWORD PTR [rcx+r10] + lea r13, QWORD PTR [rdx+r10] + vmovdqu64 zmm0, [r12] ; aes_dec_block vpxorq zmm0, zmm0, zmm8 vaesdec zmm0, zmm0, zmm9 @@ -3562,19 +3622,19 @@ L_AES_ECB_decrypt_avx512_dec_64: vmovdqa64 zmm7, zmm22 L_AES_ECB_decrypt_avx512_64_aes_dec_block_last: vaesdeclast zmm0, zmm0, zmm7 - vmovdqu64 [r11], zmm0 - add eax, 64 - cmp eax, r9d + vmovdqu64 [r13], zmm0 + add r10d, 64 + cmp r10d, r11d jl L_AES_ECB_decrypt_avx512_dec_64 L_AES_ECB_decrypt_avx512_done_64: - cmp eax, r8d - mov r9d, r8d + cmp r10d, r8d + mov r11d, r8d je L_AES_ECB_decrypt_avx512_done_dec - and r9d, 4294967280 + and r11d, 4294967280 L_AES_ECB_decrypt_avx512_dec_16: ; 16 bytes of input - lea r10, QWORD PTR [rcx+rax] - vmovdqu xmm0, OWORD PTR [r10] + lea r12, QWORD PTR [rcx+r10] + vmovdqu xmm0, OWORD PTR [r12] ; aes_dec_block vpxor xmm0, xmm0, [r9] vmovdqu xmm5, OWORD PTR [r9+16] @@ -3610,10 +3670,10 @@ L_AES_ECB_decrypt_avx512_dec_16: vmovdqu xmm5, OWORD PTR [r9+224] L_AES_ECB_decrypt_avx512_16_aes_dec_block_last: vaesdeclast xmm0, xmm0, xmm5 - lea r10, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r10], xmm0 - add eax, 16 - cmp eax, r9d + lea r12, QWORD PTR [rdx+r10] + vmovdqu OWORD PTR [r12], xmm0 + add r10d, 16 + cmp r10d, r11d jl L_AES_ECB_decrypt_avx512_dec_16 L_AES_ECB_decrypt_avx512_done_dec: vmovdqu xmm6, OWORD PTR [rsp] @@ -3627,21 +3687,25 @@ L_AES_ECB_decrypt_avx512_done_dec: vmovdqu xmm14, OWORD PTR [rsp+128] vmovdqu xmm15, OWORD PTR [rsp+144] add rsp, 160 + pop r13 + pop 
r12 ret AES_ECB_decrypt_avx512 ENDP _TEXT ENDS _TEXT SEGMENT READONLY PARA AES_CBC_encrypt_avx512 PROC - mov rax, QWORD PTR [rsp+40] - mov r10d, DWORD PTR [rsp+48] + push r12 + push r13 + mov rax, QWORD PTR [rsp+56] + mov r10d, DWORD PTR [rsp+64] vmovdqu xmm0, OWORD PTR [r8] - xor eax, eax - cmp eax, r9d + xor r11d, r11d + cmp r11d, r9d je L_AES_CBC_encrypt_avx512_done L_AES_CBC_encrypt_avx512_loop: ; 16 bytes of input - lea r10, QWORD PTR [rcx+rax] - vmovdqu xmm1, OWORD PTR [r10] + lea r12, QWORD PTR [rcx+r11] + vmovdqu xmm1, OWORD PTR [r12] vpternlogq xmm1, xmm0, [rax], 150 ; aes_enc_block vmovdqu xmm3, OWORD PTR [rax+16] @@ -3677,22 +3741,26 @@ L_AES_CBC_encrypt_avx512_loop: vmovdqu xmm3, OWORD PTR [rax+224] L_AES_CBC_encrypt_avx512_aes_enc_block_last: vaesenclast xmm1, xmm1, xmm3 - lea r11, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r11], xmm1 + lea r13, QWORD PTR [rdx+r11] + vmovdqu OWORD PTR [r13], xmm1 vmovdqa xmm0, xmm1 - add eax, 16 - cmp eax, r9d + add r11d, 16 + cmp r11d, r9d jl L_AES_CBC_encrypt_avx512_loop L_AES_CBC_encrypt_avx512_done: vmovdqu OWORD PTR [r8], xmm0 + pop r13 + pop r12 ret AES_CBC_encrypt_avx512 ENDP _TEXT ENDS _TEXT SEGMENT READONLY PARA AES_CBC_decrypt_avx512 PROC push r12 - mov rax, QWORD PTR [rsp+48] - mov r10d, DWORD PTR [rsp+56] + push r13 + push r14 + mov rax, QWORD PTR [rsp+64] + mov r10d, DWORD PTR [rsp+72] sub rsp, 160 vmovdqu OWORD PTR [rsp], xmm6 vmovdqu OWORD PTR [rsp+16], xmm7 @@ -3705,47 +3773,47 @@ AES_CBC_decrypt_avx512 PROC vmovdqu OWORD PTR [rsp+128], xmm14 vmovdqu OWORD PTR [rsp+144], xmm15 vmovdqu xmm8, OWORD PTR [r8] - xor eax, eax + xor r11d, r11d cmp r9d, 64 jl L_AES_CBC_decrypt_avx512_done_64 - vbroadcasti32x4 zmm14, [rax] - vbroadcasti32x4 zmm15, [rax+16] - vbroadcasti32x4 zmm16, [rax+32] - vbroadcasti32x4 zmm17, [rax+48] - vbroadcasti32x4 zmm18, [rax+64] - vbroadcasti32x4 zmm19, [rax+80] - vbroadcasti32x4 zmm20, [rax+96] - vbroadcasti32x4 zmm21, [rax+112] - vbroadcasti32x4 zmm22, [rax+128] - vbroadcasti32x4 
zmm23, [rax+144] - vbroadcasti32x4 zmm24, [rax+160] + vbroadcasti32x4 zmm14, OWORD PTR [rax] + vbroadcasti32x4 zmm15, OWORD PTR [rax+16] + vbroadcasti32x4 zmm16, OWORD PTR [rax+32] + vbroadcasti32x4 zmm17, OWORD PTR [rax+48] + vbroadcasti32x4 zmm18, OWORD PTR [rax+64] + vbroadcasti32x4 zmm19, OWORD PTR [rax+80] + vbroadcasti32x4 zmm20, OWORD PTR [rax+96] + vbroadcasti32x4 zmm21, OWORD PTR [rax+112] + vbroadcasti32x4 zmm22, OWORD PTR [rax+128] + vbroadcasti32x4 zmm23, OWORD PTR [rax+144] + vbroadcasti32x4 zmm24, OWORD PTR [rax+160] cmp r10d, 11 jl L_AES_CBC_decrypt_avx512_key_cached - vbroadcasti32x4 zmm25, [rax+176] - vbroadcasti32x4 zmm26, [rax+192] + vbroadcasti32x4 zmm25, OWORD PTR [rax+176] + vbroadcasti32x4 zmm26, OWORD PTR [rax+192] cmp r10d, 13 jl L_AES_CBC_decrypt_avx512_key_cached - vbroadcasti32x4 zmm27, [rax+208] - vbroadcasti32x4 zmm28, [rax+224] + vbroadcasti32x4 zmm27, OWORD PTR [rax+208] + vbroadcasti32x4 zmm28, OWORD PTR [rax+224] L_AES_CBC_decrypt_avx512_key_cached: cmp r9d, 256 - mov r10d, r9d + mov r12d, r9d jl L_AES_CBC_decrypt_avx512_done_256 - and r10d, 4294967040 + and r12d, 4294967040 L_AES_CBC_decrypt_avx512_dec_256: ; 256 bytes of input ; aes_cbc_dec_256 - lea r11, QWORD PTR [rcx+rax] - lea r12, QWORD PTR [rdx+rax] - vmovdqu64 zmm0, [r11] - vmovdqu64 zmm1, [r11+64] - vmovdqu64 zmm2, [r11+128] - vmovdqu64 zmm3, [r11+192] + lea r13, QWORD PTR [rcx+r11] + lea r14, QWORD PTR [rdx+r11] + vmovdqu64 zmm0, [r13] + vmovdqu64 zmm1, [r13+64] + vmovdqu64 zmm2, [r13+128] + vmovdqu64 zmm3, [r13+192] vshufi64x2 zmm10, zmm0, zmm0, 144 vinserti32x4 zmm10, zmm10, xmm8, 0 - vmovdqu64 zmm11, [r11+48] - vmovdqu64 zmm12, [r11+112] - vmovdqu64 zmm13, [r11+176] + vmovdqu64 zmm11, [r13+48] + vmovdqu64 zmm12, [r13+112] + vmovdqu64 zmm13, [r13+176] vextracti32x4 xmm8, zmm3, 3 ; aes_dec_block vpxorq zmm0, zmm0, zmm14 @@ -3820,24 +3888,24 @@ L_AES_CBC_decrypt_avx512_256_aes_dec_block_last: vpxorq zmm1, zmm1, zmm11 vpxorq zmm2, zmm2, zmm12 vpxorq zmm3, zmm3, zmm13 - 
vmovdqu64 [r12], zmm0 - vmovdqu64 [r12+64], zmm1 - vmovdqu64 [r12+128], zmm2 - vmovdqu64 [r12+192], zmm3 - add eax, 256 - cmp eax, r10d + vmovdqu64 [r14], zmm0 + vmovdqu64 [r14+64], zmm1 + vmovdqu64 [r14+128], zmm2 + vmovdqu64 [r14+192], zmm3 + add r11d, 256 + cmp r11d, r12d jl L_AES_CBC_decrypt_avx512_dec_256 L_AES_CBC_decrypt_avx512_done_256: - mov r10d, r9d - and r10d, 4294967232 - cmp eax, r10d + mov r12d, r9d + and r12d, 4294967232 + cmp r11d, r12d je L_AES_CBC_decrypt_avx512_done_64 L_AES_CBC_decrypt_avx512_dec_64: ; 64 bytes of input ; aes_cbc_dec_64 - lea r11, QWORD PTR [rcx+rax] - lea r12, QWORD PTR [rdx+rax] - vmovdqu64 zmm0, [r11] + lea r13, QWORD PTR [rcx+r11] + lea r14, QWORD PTR [rdx+r11] + vmovdqu64 zmm0, [r13] vshufi64x2 zmm10, zmm0, zmm0, 144 vinserti32x4 zmm10, zmm10, xmm8, 0 vextracti32x4 xmm8, zmm0, 3 @@ -3866,19 +3934,19 @@ L_AES_CBC_decrypt_avx512_dec_64: L_AES_CBC_decrypt_avx512_64_aes_dec_block_last: vaesdeclast zmm0, zmm0, zmm9 vpxorq zmm0, zmm0, zmm10 - vmovdqu64 [r12], zmm0 - add eax, 64 - cmp eax, r10d + vmovdqu64 [r14], zmm0 + add r11d, 64 + cmp r11d, r12d jl L_AES_CBC_decrypt_avx512_dec_64 L_AES_CBC_decrypt_avx512_done_64: - cmp eax, r9d - mov r10d, r9d + cmp r11d, r9d + mov r12d, r9d je L_AES_CBC_decrypt_avx512_done_dec - and r10d, 4294967280 + and r12d, 4294967280 L_AES_CBC_decrypt_avx512_dec_16: ; 16 bytes of input - lea r11, QWORD PTR [rcx+rax] - vmovdqu xmm0, OWORD PTR [r11] + lea r13, QWORD PTR [rcx+r11] + vmovdqu xmm0, OWORD PTR [r13] vmovdqa xmm7, xmm0 ; aes_dec_block vpxor xmm0, xmm0, [rax] @@ -3917,10 +3985,10 @@ L_AES_CBC_decrypt_avx512_16_aes_dec_block_last: vaesdeclast xmm0, xmm0, xmm5 vpxor xmm0, xmm0, xmm8 vmovdqa xmm8, xmm7 - lea r11, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r11], xmm0 - add eax, 16 - cmp eax, r10d + lea r13, QWORD PTR [rdx+r11] + vmovdqu OWORD PTR [r13], xmm0 + add r11d, 16 + cmp r11d, r12d jl L_AES_CBC_decrypt_avx512_dec_16 L_AES_CBC_decrypt_avx512_done_dec: vmovdqu OWORD PTR [r8], xmm8 @@ -3935,43 
+4003,45 @@ L_AES_CBC_decrypt_avx512_done_dec: vmovdqu xmm14, OWORD PTR [rsp+128] vmovdqu xmm15, OWORD PTR [rsp+144] add rsp, 160 + pop r14 + pop r13 pop r12 ret AES_CBC_decrypt_avx512 ENDP _TEXT ENDS _DATA SEGMENT ALIGN 16 -L_aes_ctr_bswap_avx512 QWORD \ - 08090a0b0c0d0e0fh, 0001020304050607h +L_aes_ctr_bswap_avx512 QWORD 08090a0b0c0d0e0fh, 0001020304050607h ptr_L_aes_ctr_bswap_avx512 QWORD L_aes_ctr_bswap_avx512 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_aes_ctr_inc_avx512 QWORD \ - 0000000000000000h, 0000000000000000h, - 0000000000000001h, 0000000000000000h, - 0000000000000002h, 0000000000000000h, - 0000000000000003h, 0000000000000000h, - 0000000000000004h, 0000000000000000h, - 0000000000000005h, 0000000000000000h, - 0000000000000006h, 0000000000000000h, - 0000000000000007h, 0000000000000000h, - 0000000000000008h, 0000000000000000h, - 0000000000000009h, 0000000000000000h, - 000000000000000ah, 0000000000000000h, - 000000000000000bh, 0000000000000000h, - 000000000000000ch, 0000000000000000h, - 000000000000000dh, 0000000000000000h, - 000000000000000eh, 0000000000000000h, - 000000000000000fh, 0000000000000000h, - 0000000000000010h, 0000000000000000h +L_aes_ctr_inc_avx512 QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000000000001h, 0000000000000000h + QWORD 0000000000000002h, 0000000000000000h + QWORD 0000000000000003h, 0000000000000000h + QWORD 0000000000000004h, 0000000000000000h + QWORD 0000000000000005h, 0000000000000000h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000000000008h, 0000000000000000h + QWORD 0000000000000009h, 0000000000000000h + QWORD 000000000000000ah, 0000000000000000h + QWORD 000000000000000bh, 0000000000000000h + QWORD 000000000000000ch, 0000000000000000h + QWORD 000000000000000dh, 0000000000000000h + QWORD 000000000000000eh, 0000000000000000h + QWORD 000000000000000fh, 0000000000000000h + QWORD 0000000000000010h, 0000000000000000h ptr_L_aes_ctr_inc_avx512 QWORD L_aes_ctr_inc_avx512 
_DATA ENDS _TEXT SEGMENT READONLY PARA AES_CTR_encrypt_avx512 PROC - push rbx - mov eax, DWORD PTR [rsp+48] - mov r10, QWORD PTR [rsp+56] + push r12 + push r13 + push r14 + mov eax, DWORD PTR [rsp+64] + mov r10, QWORD PTR [rsp+72] sub rsp, 160 vmovdqu OWORD PTR [rsp], xmm6 vmovdqu OWORD PTR [rsp+16], xmm7 @@ -3984,38 +4054,38 @@ AES_CTR_encrypt_avx512 PROC vmovdqu OWORD PTR [rsp+128], xmm14 vmovdqu OWORD PTR [rsp+144], xmm15 vbroadcasti32x4 zmm8, ptr_L_aes_ctr_bswap_avx512 - vbroadcasti32x4 zmm7, [r10] + vbroadcasti32x4 zmm7, OWORD PTR [r10] vpshufb zmm7, zmm7, zmm8 - vbroadcasti32x4 zmm10, [ptr_L_aes_ctr_inc_avx512+256] - vbroadcasti32x4 zmm11, [ptr_L_aes_ctr_inc_avx512+64] - vbroadcasti32x4 zmm12, [ptr_L_aes_ctr_inc_avx512+16] - xor eax, eax + vbroadcasti32x4 zmm10, OWORD PTR [ptr_L_aes_ctr_inc_avx512+256] + vbroadcasti32x4 zmm11, OWORD PTR [ptr_L_aes_ctr_inc_avx512+64] + vbroadcasti32x4 zmm12, OWORD PTR [ptr_L_aes_ctr_inc_avx512+16] + xor r11d, r11d cmp r8d, 64 jl L_AES_CTR_encrypt_avx512_done_64 - vbroadcasti32x4 zmm14, [r9] - vbroadcasti32x4 zmm15, [r9+16] - vbroadcasti32x4 zmm16, [r9+32] - vbroadcasti32x4 zmm17, [r9+48] - vbroadcasti32x4 zmm18, [r9+64] - vbroadcasti32x4 zmm19, [r9+80] - vbroadcasti32x4 zmm20, [r9+96] - vbroadcasti32x4 zmm21, [r9+112] - vbroadcasti32x4 zmm22, [r9+128] - vbroadcasti32x4 zmm23, [r9+144] - vbroadcasti32x4 zmm24, [r9+160] + vbroadcasti32x4 zmm14, OWORD PTR [r9] + vbroadcasti32x4 zmm15, OWORD PTR [r9+16] + vbroadcasti32x4 zmm16, OWORD PTR [r9+32] + vbroadcasti32x4 zmm17, OWORD PTR [r9+48] + vbroadcasti32x4 zmm18, OWORD PTR [r9+64] + vbroadcasti32x4 zmm19, OWORD PTR [r9+80] + vbroadcasti32x4 zmm20, OWORD PTR [r9+96] + vbroadcasti32x4 zmm21, OWORD PTR [r9+112] + vbroadcasti32x4 zmm22, OWORD PTR [r9+128] + vbroadcasti32x4 zmm23, OWORD PTR [r9+144] + vbroadcasti32x4 zmm24, OWORD PTR [r9+160] cmp eax, 11 jl L_AES_CTR_encrypt_avx512_key_cached - vbroadcasti32x4 zmm25, [r9+176] - vbroadcasti32x4 zmm26, [r9+192] + vbroadcasti32x4 zmm25, 
OWORD PTR [r9+176] + vbroadcasti32x4 zmm26, OWORD PTR [r9+192] cmp eax, 13 jl L_AES_CTR_encrypt_avx512_key_cached - vbroadcasti32x4 zmm27, [r9+208] - vbroadcasti32x4 zmm28, [r9+224] + vbroadcasti32x4 zmm27, OWORD PTR [r9+208] + vbroadcasti32x4 zmm28, OWORD PTR [r9+224] L_AES_CTR_encrypt_avx512_key_cached: cmp r8d, 256 - mov r10d, r8d + mov r12d, r8d jl L_AES_CTR_encrypt_avx512_done_256 - and r10d, 4294967040 + and r12d, 4294967040 vmovdqa64 zmm9, zmm7 vpaddq zmm4, zmm7, [ptr_L_aes_ctr_inc_avx512] vpternlogq zmm9, zmm4, [ptr_L_aes_ctr_inc_avx512], 178 @@ -4042,8 +4112,8 @@ L_AES_CTR_encrypt_avx512_key_cached: vpaddq zmm7, zmm7, zmm9 L_AES_CTR_encrypt_avx512_enc_256: ; 256 bytes of input - lea r11, QWORD PTR [rcx+rax] - lea rbx, QWORD PTR [rdx+rax] + lea r13, QWORD PTR [rcx+r11] + lea r14, QWORD PTR [rdx+r11] vpshufb zmm0, zmm4, zmm8 vpshufb zmm1, zmm5, zmm8 vpshufb zmm2, zmm6, zmm8 @@ -4141,28 +4211,28 @@ L_AES_CTR_encrypt_avx512_256_aes_enc_block_last: vaesenclast zmm1, zmm1, zmm13 vaesenclast zmm2, zmm2, zmm13 vaesenclast zmm3, zmm3, zmm13 - vpxorq zmm0, zmm0, [r11] - vpxorq zmm1, zmm1, [r11+64] - vpxorq zmm2, zmm2, [r11+128] - vpxorq zmm3, zmm3, [r11+192] - vmovdqu64 [rbx], zmm0 - vmovdqu64 [rbx+64], zmm1 - vmovdqu64 [rbx+128], zmm2 - vmovdqu64 [rbx+192], zmm3 - add eax, 256 - cmp eax, r10d + vpxorq zmm0, zmm0, [r13] + vpxorq zmm1, zmm1, [r13+64] + vpxorq zmm2, zmm2, [r13+128] + vpxorq zmm3, zmm3, [r13+192] + vmovdqu64 [r14], zmm0 + vmovdqu64 [r14+64], zmm1 + vmovdqu64 [r14+128], zmm2 + vmovdqu64 [r14+192], zmm3 + add r11d, 256 + cmp r11d, r12d jl L_AES_CTR_encrypt_avx512_enc_256 vshufi64x2 zmm7, zmm4, zmm4, 0 L_AES_CTR_encrypt_avx512_done_256: - mov r10d, r8d - and r10d, 4294967232 - cmp eax, r10d + mov r12d, r8d + and r12d, 4294967232 + cmp r11d, r12d je L_AES_CTR_encrypt_avx512_done_64 L_AES_CTR_encrypt_avx512_enc_64: ; 64 bytes of input ; aes_ctr_enc_64 - lea r11, QWORD PTR [rcx+rax] - lea rbx, QWORD PTR [rdx+rax] + lea r13, QWORD PTR [rcx+r11] + lea r14, 
QWORD PTR [rdx+r11] vpaddq zmm0, zmm7, [ptr_L_aes_ctr_inc_avx512] vmovdqa64 zmm9, zmm7 vpternlogq zmm9, zmm0, [ptr_L_aes_ctr_inc_avx512], 178 @@ -4200,16 +4270,16 @@ L_AES_CTR_encrypt_avx512_enc_64: vmovdqa64 zmm13, zmm28 L_AES_CTR_encrypt_avx512_64_aes_enc_block_last: vaesenclast zmm0, zmm0, zmm13 - vpxorq zmm0, zmm0, [r11] - vmovdqu64 [rbx], zmm0 - add eax, 64 - cmp eax, r10d + vpxorq zmm0, zmm0, [r13] + vmovdqu64 [r14], zmm0 + add r11d, 64 + cmp r11d, r12d jl L_AES_CTR_encrypt_avx512_enc_64 L_AES_CTR_encrypt_avx512_done_64: - cmp eax, r8d - mov r10d, r8d + cmp r11d, r8d + mov r12d, r8d je L_AES_CTR_encrypt_avx512_done_enc - and r10d, 4294967280 + and r12d, 4294967280 L_AES_CTR_encrypt_avx512_enc_16: ; 16 bytes of input vpshufb xmm0, xmm7, xmm8 @@ -4254,12 +4324,12 @@ L_AES_CTR_encrypt_avx512_enc_16: vmovdqu xmm5, OWORD PTR [r9+224] L_AES_CTR_encrypt_avx512_16_aes_enc_block_last: vaesenclast xmm0, xmm0, xmm5 - lea r11, QWORD PTR [rcx+rax] - vpxor xmm0, xmm0, [r11] - lea r11, QWORD PTR [rdx+rax] - vmovdqu OWORD PTR [r11], xmm0 - add eax, 16 - cmp eax, r10d + lea r13, QWORD PTR [rcx+r11] + vpxor xmm0, xmm0, [r13] + lea r13, QWORD PTR [rdx+r11] + vmovdqu OWORD PTR [r13], xmm0 + add r11d, 16 + cmp r11d, r12d jl L_AES_CTR_encrypt_avx512_enc_16 L_AES_CTR_encrypt_avx512_done_enc: vpshufb xmm0, xmm7, xmm8 @@ -4275,7 +4345,9 @@ L_AES_CTR_encrypt_avx512_done_enc: vmovdqu xmm14, OWORD PTR [rsp+128] vmovdqu xmm15, OWORD PTR [rsp+144] add rsp, 160 - pop rbx + pop r14 + pop r13 + pop r12 ret AES_CTR_encrypt_avx512 ENDP _TEXT ENDS diff --git a/wolfcrypt/src/aes_xts_asm.asm b/wolfcrypt/src/aes_xts_asm.asm index a904ffa4ce7..d11e836fa0e 100644 --- a/wolfcrypt/src/aes_xts_asm.asm +++ b/wolfcrypt/src/aes_xts_asm.asm @@ -85,8 +85,7 @@ AES_XTS_init_aesni ENDP _TEXT ENDS _DATA SEGMENT ALIGN 16 -L_aes_xts_gc_xts DWORD \ - 00000087h, 00000001h, 00000001h, 00000001h +L_aes_xts_gc_xts DWORD 00000087h, 00000001h, 00000001h, 00000001h ptr_L_aes_xts_gc_xts QWORD L_aes_xts_gc_xts _DATA ENDS 
_TEXT SEGMENT READONLY PARA @@ -1500,8 +1499,7 @@ AES_XTS_init_avx1 ENDP _TEXT ENDS _DATA SEGMENT ALIGN 16 -L_avx1_aes_xts_gc_xts DWORD \ - 00000087h, 00000001h, 00000001h, 00000001h +L_avx1_aes_xts_gc_xts DWORD 00000087h, 00000001h, 00000001h, 00000001h ptr_L_avx1_aes_xts_gc_xts QWORD L_avx1_aes_xts_gc_xts _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -2876,28 +2874,24 @@ AES_XTS_init_vaes ENDP _TEXT ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_xts_gc_xts DWORD \ - 00000087h, 00000000h, 00000001h, 00000000h +L_vaes_aes_xts_gc_xts DWORD 00000087h, 00000000h, 00000001h, 00000000h ptr_L_vaes_aes_xts_gc_xts QWORD L_vaes_aes_xts_gc_xts _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_xts_poly DWORD \ - 00000087h, 00000000h, 00000000h, 00000000h +L_vaes_aes_xts_poly DWORD 00000087h, 00000000h, 00000000h, 00000000h ptr_L_vaes_aes_xts_poly QWORD L_vaes_aes_xts_poly _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_xts_shl DWORD \ - 00000000h, 00000000h, 00000000h, 00000000h, - 00000001h, 00000000h, 00000001h, 00000000h +L_vaes_aes_xts_shl DWORD 00000000h, 00000000h, 00000000h, 00000000h + DWORD 00000001h, 00000000h, 00000001h, 00000000h ptr_L_vaes_aes_xts_shl QWORD L_vaes_aes_xts_shl _DATA ENDS _DATA SEGMENT ALIGN 16 -L_vaes_aes_xts_shr DWORD \ - 00000040h, 00000000h, 00000040h, 00000000h, - 0000003fh, 00000000h, 0000003fh, 00000000h +L_vaes_aes_xts_shr DWORD 00000040h, 00000000h, 00000040h, 00000000h + DWORD 0000003fh, 00000000h, 0000003fh, 00000000h ptr_L_vaes_aes_xts_shr QWORD L_vaes_aes_xts_shr _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -3006,7 +3000,7 @@ L_AES_XTS_encrypt_vaes_enc_128: vmovdqu ymm2, YMMWORD PTR [rcx+64] vmovdqu ymm3, YMMWORD PTR [rcx+96] ; aes_enc_block - vbroadcasti128 ymm9, [r8] + vbroadcasti128 ymm9, OWORD PTR [r8] vpxor ymm0, ymm0, ymm4 vpxor ymm0, ymm0, ymm9 vpxor ymm1, ymm1, ymm5 @@ -3015,76 +3009,76 @@ L_AES_XTS_encrypt_vaes_enc_128: vpxor ymm2, ymm2, ymm9 vpxor ymm3, ymm3, ymm7 vpxor ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+16] + vbroadcasti128 ymm9, OWORD 
PTR [r8+16] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+32] + vbroadcasti128 ymm9, OWORD PTR [r8+32] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+48] + vbroadcasti128 ymm9, OWORD PTR [r8+48] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+64] + vbroadcasti128 ymm9, OWORD PTR [r8+64] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+80] + vbroadcasti128 ymm9, OWORD PTR [r8+80] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+96] + vbroadcasti128 ymm9, OWORD PTR [r8+96] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+112] + vbroadcasti128 ymm9, OWORD PTR [r8+112] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+128] + vbroadcasti128 ymm9, OWORD PTR [r8+128] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+144] + vbroadcasti128 ymm9, OWORD PTR [r8+144] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 cmp r10d, 11 - vbroadcasti128 ymm9, [r8+160] + vbroadcasti128 ymm9, OWORD PTR [r8+160] jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+176] + vbroadcasti128 ymm9, OWORD PTR [r8+176] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 cmp r10d, 13 - vbroadcasti128 ymm9, [r8+192] + vbroadcasti128 ymm9, OWORD PTR [r8+192] 
jl L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+208] + vbroadcasti128 ymm9, OWORD PTR [r8+208] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+224] + vbroadcasti128 ymm9, OWORD PTR [r8+224] L_AES_XTS_encrypt_vaes_aes_enc_128_aes_enc_block_last: vaesenclast ymm0, ymm0, ymm9 vaesenclast ymm1, ymm1, ymm9 @@ -3151,55 +3145,55 @@ L_AES_XTS_encrypt_vaes_done_128: vpxor ymm5, ymm5, ymm10 vpxor ymm5, ymm5, ymm9 ; aes_enc_block - vbroadcasti128 ymm9, [r8] + vbroadcasti128 ymm9, OWORD PTR [r8] vpxor ymm0, ymm0, ymm4 vpxor ymm0, ymm0, ymm9 vpxor ymm1, ymm1, ymm5 vpxor ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+16] + vbroadcasti128 ymm9, OWORD PTR [r8+16] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+32] + vbroadcasti128 ymm9, OWORD PTR [r8+32] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+48] + vbroadcasti128 ymm9, OWORD PTR [r8+48] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+64] + vbroadcasti128 ymm9, OWORD PTR [r8+64] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+80] + vbroadcasti128 ymm9, OWORD PTR [r8+80] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+96] + vbroadcasti128 ymm9, OWORD PTR [r8+96] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+112] + vbroadcasti128 ymm9, OWORD PTR [r8+112] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+128] + vbroadcasti128 ymm9, OWORD PTR [r8+128] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+144] + vbroadcasti128 ymm9, OWORD PTR [r8+144] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 cmp r10d, 11 - vbroadcasti128 ymm9, [r8+160] + vbroadcasti128 ymm9, OWORD PTR [r8+160] jl 
L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+176] + vbroadcasti128 ymm9, OWORD PTR [r8+176] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 cmp r10d, 13 - vbroadcasti128 ymm9, [r8+192] + vbroadcasti128 ymm9, OWORD PTR [r8+192] jl L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+208] + vbroadcasti128 ymm9, OWORD PTR [r8+208] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+224] + vbroadcasti128 ymm9, OWORD PTR [r8+224] L_AES_XTS_encrypt_vaes_aes_enc_64_aes_enc_block_last: vaesenclast ymm0, ymm0, ymm9 vaesenclast ymm1, ymm1, ymm9 @@ -3232,40 +3226,40 @@ L_AES_XTS_encrypt_vaes_done_64: vpxor ymm4, ymm4, ymm7 vpxor ymm4, ymm4, ymm6 ; aes_enc_block - vbroadcasti128 ymm9, [r8] + vbroadcasti128 ymm9, OWORD PTR [r8] vpxor ymm0, ymm0, ymm4 vpxor ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+16] + vbroadcasti128 ymm9, OWORD PTR [r8+16] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+32] + vbroadcasti128 ymm9, OWORD PTR [r8+32] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+48] + vbroadcasti128 ymm9, OWORD PTR [r8+48] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+64] + vbroadcasti128 ymm9, OWORD PTR [r8+64] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+80] + vbroadcasti128 ymm9, OWORD PTR [r8+80] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+96] + vbroadcasti128 ymm9, OWORD PTR [r8+96] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+112] + vbroadcasti128 ymm9, OWORD PTR [r8+112] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+128] + vbroadcasti128 ymm9, OWORD PTR [r8+128] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+144] + vbroadcasti128 ymm9, OWORD PTR [r8+144] vaesenc ymm0, ymm0, ymm9 cmp r10d, 11 - vbroadcasti128 ymm9, [r8+160] + vbroadcasti128 ymm9, OWORD PTR [r8+160] jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last vaesenc ymm0, 
ymm0, ymm9 - vbroadcasti128 ymm9, [r8+176] + vbroadcasti128 ymm9, OWORD PTR [r8+176] vaesenc ymm0, ymm0, ymm9 cmp r10d, 13 - vbroadcasti128 ymm9, [r8+192] + vbroadcasti128 ymm9, OWORD PTR [r8+192] jl L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+208] + vbroadcasti128 ymm9, OWORD PTR [r8+208] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+224] + vbroadcasti128 ymm9, OWORD PTR [r8+224] L_AES_XTS_encrypt_vaes_aes_enc_32_aes_enc_block_last: vaesenclast ymm0, ymm0, ymm9 vpxor ymm0, ymm0, ymm4 @@ -3485,7 +3479,7 @@ L_AES_XTS_encrypt_update_vaes_enc_128: vmovdqu ymm2, YMMWORD PTR [rcx+64] vmovdqu ymm3, YMMWORD PTR [rcx+96] ; aes_enc_block - vbroadcasti128 ymm9, [r10] + vbroadcasti128 ymm9, OWORD PTR [r10] vpxor ymm0, ymm0, ymm4 vpxor ymm0, ymm0, ymm9 vpxor ymm1, ymm1, ymm5 @@ -3494,76 +3488,76 @@ L_AES_XTS_encrypt_update_vaes_enc_128: vpxor ymm2, ymm2, ymm9 vpxor ymm3, ymm3, ymm7 vpxor ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+16] + vbroadcasti128 ymm9, OWORD PTR [r10+16] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+32] + vbroadcasti128 ymm9, OWORD PTR [r10+32] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+48] + vbroadcasti128 ymm9, OWORD PTR [r10+48] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+64] + vbroadcasti128 ymm9, OWORD PTR [r10+64] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+80] + vbroadcasti128 ymm9, OWORD PTR [r10+80] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+96] + vbroadcasti128 ymm9, OWORD PTR [r10+96] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc 
ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+112] + vbroadcasti128 ymm9, OWORD PTR [r10+112] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+128] + vbroadcasti128 ymm9, OWORD PTR [r10+128] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+144] + vbroadcasti128 ymm9, OWORD PTR [r10+144] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 cmp r9d, 11 - vbroadcasti128 ymm9, [r10+160] + vbroadcasti128 ymm9, OWORD PTR [r10+160] jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+176] + vbroadcasti128 ymm9, OWORD PTR [r10+176] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 cmp r9d, 13 - vbroadcasti128 ymm9, [r10+192] + vbroadcasti128 ymm9, OWORD PTR [r10+192] jl L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+208] + vbroadcasti128 ymm9, OWORD PTR [r10+208] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 vaesenc ymm2, ymm2, ymm9 vaesenc ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+224] + vbroadcasti128 ymm9, OWORD PTR [r10+224] L_AES_XTS_encrypt_update_vaes_aes_enc_128_aes_enc_block_last: vaesenclast ymm0, ymm0, ymm9 vaesenclast ymm1, ymm1, ymm9 @@ -3630,55 +3624,55 @@ L_AES_XTS_encrypt_update_vaes_done_128: vpxor ymm5, ymm5, ymm10 vpxor ymm5, ymm5, ymm9 ; aes_enc_block - vbroadcasti128 ymm9, [r10] + vbroadcasti128 ymm9, OWORD PTR [r10] vpxor ymm0, ymm0, ymm4 vpxor ymm0, ymm0, ymm9 vpxor ymm1, ymm1, ymm5 vpxor ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+16] + vbroadcasti128 ymm9, OWORD PTR [r10+16] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, 
ymm1, ymm9 - vbroadcasti128 ymm9, [r10+32] + vbroadcasti128 ymm9, OWORD PTR [r10+32] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+48] + vbroadcasti128 ymm9, OWORD PTR [r10+48] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+64] + vbroadcasti128 ymm9, OWORD PTR [r10+64] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+80] + vbroadcasti128 ymm9, OWORD PTR [r10+80] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+96] + vbroadcasti128 ymm9, OWORD PTR [r10+96] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+112] + vbroadcasti128 ymm9, OWORD PTR [r10+112] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+128] + vbroadcasti128 ymm9, OWORD PTR [r10+128] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+144] + vbroadcasti128 ymm9, OWORD PTR [r10+144] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 cmp r9d, 11 - vbroadcasti128 ymm9, [r10+160] + vbroadcasti128 ymm9, OWORD PTR [r10+160] jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+176] + vbroadcasti128 ymm9, OWORD PTR [r10+176] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 cmp r9d, 13 - vbroadcasti128 ymm9, [r10+192] + vbroadcasti128 ymm9, OWORD PTR [r10+192] jl L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+208] + vbroadcasti128 ymm9, OWORD PTR [r10+208] vaesenc ymm0, ymm0, ymm9 vaesenc ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+224] + vbroadcasti128 ymm9, OWORD PTR [r10+224] L_AES_XTS_encrypt_update_vaes_aes_enc_64_aes_enc_block_last: vaesenclast ymm0, ymm0, ymm9 vaesenclast ymm1, ymm1, ymm9 @@ -3711,40 +3705,40 @@ L_AES_XTS_encrypt_update_vaes_done_64: vpxor ymm4, ymm4, ymm7 vpxor ymm4, ymm4, ymm6 ; aes_enc_block - vbroadcasti128 
ymm9, [r10] + vbroadcasti128 ymm9, OWORD PTR [r10] vpxor ymm0, ymm0, ymm4 vpxor ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+16] + vbroadcasti128 ymm9, OWORD PTR [r10+16] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+32] + vbroadcasti128 ymm9, OWORD PTR [r10+32] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+48] + vbroadcasti128 ymm9, OWORD PTR [r10+48] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+64] + vbroadcasti128 ymm9, OWORD PTR [r10+64] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+80] + vbroadcasti128 ymm9, OWORD PTR [r10+80] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+96] + vbroadcasti128 ymm9, OWORD PTR [r10+96] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+112] + vbroadcasti128 ymm9, OWORD PTR [r10+112] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+128] + vbroadcasti128 ymm9, OWORD PTR [r10+128] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+144] + vbroadcasti128 ymm9, OWORD PTR [r10+144] vaesenc ymm0, ymm0, ymm9 cmp r9d, 11 - vbroadcasti128 ymm9, [r10+160] + vbroadcasti128 ymm9, OWORD PTR [r10+160] jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+176] + vbroadcasti128 ymm9, OWORD PTR [r10+176] vaesenc ymm0, ymm0, ymm9 cmp r9d, 13 - vbroadcasti128 ymm9, [r10+192] + vbroadcasti128 ymm9, OWORD PTR [r10+192] jl L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+208] + vbroadcasti128 ymm9, OWORD PTR [r10+208] vaesenc ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+224] + vbroadcasti128 ymm9, OWORD PTR [r10+224] L_AES_XTS_encrypt_update_vaes_aes_enc_32_aes_enc_block_last: vaesenclast ymm0, ymm0, ymm9 vpxor ymm0, ymm0, ymm4 @@ -4008,7 +4002,7 @@ L_AES_XTS_decrypt_vaes_dec_128: vmovdqu ymm2, YMMWORD PTR [rcx+64] vmovdqu ymm3, YMMWORD PTR [rcx+96] ; aes_dec_block - vbroadcasti128 ymm9, [r8] + vbroadcasti128 ymm9, OWORD PTR [r8] vpxor ymm0, ymm0, ymm4 vpxor ymm0, ymm0, ymm9 vpxor 
ymm1, ymm1, ymm5 @@ -4017,76 +4011,76 @@ L_AES_XTS_decrypt_vaes_dec_128: vpxor ymm2, ymm2, ymm9 vpxor ymm3, ymm3, ymm7 vpxor ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+16] + vbroadcasti128 ymm9, OWORD PTR [r8+16] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+32] + vbroadcasti128 ymm9, OWORD PTR [r8+32] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+48] + vbroadcasti128 ymm9, OWORD PTR [r8+48] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+64] + vbroadcasti128 ymm9, OWORD PTR [r8+64] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+80] + vbroadcasti128 ymm9, OWORD PTR [r8+80] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+96] + vbroadcasti128 ymm9, OWORD PTR [r8+96] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+112] + vbroadcasti128 ymm9, OWORD PTR [r8+112] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+128] + vbroadcasti128 ymm9, OWORD PTR [r8+128] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+144] + vbroadcasti128 ymm9, OWORD PTR [r8+144] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 cmp r10d, 11 - vbroadcasti128 ymm9, [r8+160] + vbroadcasti128 ymm9, OWORD PTR [r8+160] jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+176] + vbroadcasti128 ymm9, 
OWORD PTR [r8+176] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 cmp r10d, 13 - vbroadcasti128 ymm9, [r8+192] + vbroadcasti128 ymm9, OWORD PTR [r8+192] jl L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+208] + vbroadcasti128 ymm9, OWORD PTR [r8+208] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r8+224] + vbroadcasti128 ymm9, OWORD PTR [r8+224] L_AES_XTS_decrypt_vaes_aes_dec_128_aes_dec_block_last: vaesdeclast ymm0, ymm0, ymm9 vaesdeclast ymm1, ymm1, ymm9 @@ -4164,55 +4158,55 @@ L_AES_XTS_decrypt_vaes_mul16_64: vpxor ymm5, ymm5, ymm10 vpxor ymm5, ymm5, ymm9 ; aes_dec_block - vbroadcasti128 ymm9, [r8] + vbroadcasti128 ymm9, OWORD PTR [r8] vpxor ymm0, ymm0, ymm4 vpxor ymm0, ymm0, ymm9 vpxor ymm1, ymm1, ymm5 vpxor ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+16] + vbroadcasti128 ymm9, OWORD PTR [r8+16] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+32] + vbroadcasti128 ymm9, OWORD PTR [r8+32] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+48] + vbroadcasti128 ymm9, OWORD PTR [r8+48] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+64] + vbroadcasti128 ymm9, OWORD PTR [r8+64] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+80] + vbroadcasti128 ymm9, OWORD PTR [r8+80] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+96] + vbroadcasti128 ymm9, OWORD PTR [r8+96] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+112] + vbroadcasti128 ymm9, OWORD PTR [r8+112] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+128] + vbroadcasti128 ymm9, OWORD PTR [r8+128] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 
ymm9, [r8+144] + vbroadcasti128 ymm9, OWORD PTR [r8+144] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 cmp r10d, 11 - vbroadcasti128 ymm9, [r8+160] + vbroadcasti128 ymm9, OWORD PTR [r8+160] jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+176] + vbroadcasti128 ymm9, OWORD PTR [r8+176] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 cmp r10d, 13 - vbroadcasti128 ymm9, [r8+192] + vbroadcasti128 ymm9, OWORD PTR [r8+192] jl L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+208] + vbroadcasti128 ymm9, OWORD PTR [r8+208] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r8+224] + vbroadcasti128 ymm9, OWORD PTR [r8+224] L_AES_XTS_decrypt_vaes_aes_dec_64_aes_dec_block_last: vaesdeclast ymm0, ymm0, ymm9 vaesdeclast ymm1, ymm1, ymm9 @@ -4256,40 +4250,40 @@ L_AES_XTS_decrypt_vaes_mul16_32: vpxor ymm4, ymm4, ymm7 vpxor ymm4, ymm4, ymm6 ; aes_dec_block - vbroadcasti128 ymm9, [r8] + vbroadcasti128 ymm9, OWORD PTR [r8] vpxor ymm0, ymm0, ymm4 vpxor ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+16] + vbroadcasti128 ymm9, OWORD PTR [r8+16] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+32] + vbroadcasti128 ymm9, OWORD PTR [r8+32] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+48] + vbroadcasti128 ymm9, OWORD PTR [r8+48] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+64] + vbroadcasti128 ymm9, OWORD PTR [r8+64] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+80] + vbroadcasti128 ymm9, OWORD PTR [r8+80] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+96] + vbroadcasti128 ymm9, OWORD PTR [r8+96] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+112] + vbroadcasti128 ymm9, OWORD PTR [r8+112] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+128] + vbroadcasti128 ymm9, OWORD PTR [r8+128] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+144] + vbroadcasti128 ymm9, OWORD 
PTR [r8+144] vaesdec ymm0, ymm0, ymm9 cmp r10d, 11 - vbroadcasti128 ymm9, [r8+160] + vbroadcasti128 ymm9, OWORD PTR [r8+160] jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+176] + vbroadcasti128 ymm9, OWORD PTR [r8+176] vaesdec ymm0, ymm0, ymm9 cmp r10d, 13 - vbroadcasti128 ymm9, [r8+192] + vbroadcasti128 ymm9, OWORD PTR [r8+192] jl L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+208] + vbroadcasti128 ymm9, OWORD PTR [r8+208] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r8+224] + vbroadcasti128 ymm9, OWORD PTR [r8+224] L_AES_XTS_decrypt_vaes_aes_dec_32_aes_dec_block_last: vaesdeclast ymm0, ymm0, ymm9 vpxor ymm0, ymm0, ymm4 @@ -4561,7 +4555,7 @@ L_AES_XTS_decrypt_update_vaes_dec_128: vmovdqu ymm2, YMMWORD PTR [rcx+64] vmovdqu ymm3, YMMWORD PTR [rcx+96] ; aes_dec_block - vbroadcasti128 ymm9, [r10] + vbroadcasti128 ymm9, OWORD PTR [r10] vpxor ymm0, ymm0, ymm4 vpxor ymm0, ymm0, ymm9 vpxor ymm1, ymm1, ymm5 @@ -4570,76 +4564,76 @@ L_AES_XTS_decrypt_update_vaes_dec_128: vpxor ymm2, ymm2, ymm9 vpxor ymm3, ymm3, ymm7 vpxor ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+16] + vbroadcasti128 ymm9, OWORD PTR [r10+16] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+32] + vbroadcasti128 ymm9, OWORD PTR [r10+32] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+48] + vbroadcasti128 ymm9, OWORD PTR [r10+48] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+64] + vbroadcasti128 ymm9, OWORD PTR [r10+64] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+80] + vbroadcasti128 ymm9, OWORD PTR [r10+80] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec 
ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+96] + vbroadcasti128 ymm9, OWORD PTR [r10+96] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+112] + vbroadcasti128 ymm9, OWORD PTR [r10+112] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+128] + vbroadcasti128 ymm9, OWORD PTR [r10+128] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+144] + vbroadcasti128 ymm9, OWORD PTR [r10+144] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 cmp r9d, 11 - vbroadcasti128 ymm9, [r10+160] + vbroadcasti128 ymm9, OWORD PTR [r10+160] jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+176] + vbroadcasti128 ymm9, OWORD PTR [r10+176] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 cmp r9d, 13 - vbroadcasti128 ymm9, [r10+192] + vbroadcasti128 ymm9, OWORD PTR [r10+192] jl L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+208] + vbroadcasti128 ymm9, OWORD PTR [r10+208] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 vaesdec ymm2, ymm2, ymm9 vaesdec ymm3, ymm3, ymm9 - vbroadcasti128 ymm9, [r10+224] + vbroadcasti128 ymm9, OWORD PTR [r10+224] L_AES_XTS_decrypt_update_vaes_aes_dec_128_aes_dec_block_last: vaesdeclast ymm0, ymm0, ymm9 vaesdeclast ymm1, ymm1, ymm9 @@ -4717,55 +4711,55 @@ L_AES_XTS_decrypt_update_vaes_mul16_64: vpxor ymm5, ymm5, ymm10 vpxor ymm5, ymm5, ymm9 ; aes_dec_block - vbroadcasti128 ymm9, [r10] + vbroadcasti128 ymm9, OWORD PTR [r10] vpxor 
ymm0, ymm0, ymm4 vpxor ymm0, ymm0, ymm9 vpxor ymm1, ymm1, ymm5 vpxor ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+16] + vbroadcasti128 ymm9, OWORD PTR [r10+16] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+32] + vbroadcasti128 ymm9, OWORD PTR [r10+32] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+48] + vbroadcasti128 ymm9, OWORD PTR [r10+48] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+64] + vbroadcasti128 ymm9, OWORD PTR [r10+64] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+80] + vbroadcasti128 ymm9, OWORD PTR [r10+80] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+96] + vbroadcasti128 ymm9, OWORD PTR [r10+96] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+112] + vbroadcasti128 ymm9, OWORD PTR [r10+112] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+128] + vbroadcasti128 ymm9, OWORD PTR [r10+128] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+144] + vbroadcasti128 ymm9, OWORD PTR [r10+144] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 cmp r9d, 11 - vbroadcasti128 ymm9, [r10+160] + vbroadcasti128 ymm9, OWORD PTR [r10+160] jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+176] + vbroadcasti128 ymm9, OWORD PTR [r10+176] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 cmp r9d, 13 - vbroadcasti128 ymm9, [r10+192] + vbroadcasti128 ymm9, OWORD PTR [r10+192] jl L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+208] + vbroadcasti128 ymm9, OWORD PTR [r10+208] vaesdec ymm0, ymm0, ymm9 vaesdec ymm1, ymm1, ymm9 - vbroadcasti128 ymm9, [r10+224] + vbroadcasti128 ymm9, OWORD PTR [r10+224] L_AES_XTS_decrypt_update_vaes_aes_dec_64_aes_dec_block_last: 
vaesdeclast ymm0, ymm0, ymm9 vaesdeclast ymm1, ymm1, ymm9 @@ -4809,40 +4803,40 @@ L_AES_XTS_decrypt_update_vaes_mul16_32: vpxor ymm4, ymm4, ymm7 vpxor ymm4, ymm4, ymm6 ; aes_dec_block - vbroadcasti128 ymm9, [r10] + vbroadcasti128 ymm9, OWORD PTR [r10] vpxor ymm0, ymm0, ymm4 vpxor ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+16] + vbroadcasti128 ymm9, OWORD PTR [r10+16] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+32] + vbroadcasti128 ymm9, OWORD PTR [r10+32] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+48] + vbroadcasti128 ymm9, OWORD PTR [r10+48] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+64] + vbroadcasti128 ymm9, OWORD PTR [r10+64] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+80] + vbroadcasti128 ymm9, OWORD PTR [r10+80] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+96] + vbroadcasti128 ymm9, OWORD PTR [r10+96] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+112] + vbroadcasti128 ymm9, OWORD PTR [r10+112] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+128] + vbroadcasti128 ymm9, OWORD PTR [r10+128] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+144] + vbroadcasti128 ymm9, OWORD PTR [r10+144] vaesdec ymm0, ymm0, ymm9 cmp r9d, 11 - vbroadcasti128 ymm9, [r10+160] + vbroadcasti128 ymm9, OWORD PTR [r10+160] jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+176] + vbroadcasti128 ymm9, OWORD PTR [r10+176] vaesdec ymm0, ymm0, ymm9 cmp r9d, 13 - vbroadcasti128 ymm9, [r10+192] + vbroadcasti128 ymm9, OWORD PTR [r10+192] jl L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+208] + vbroadcasti128 ymm9, OWORD PTR [r10+208] vaesdec ymm0, ymm0, ymm9 - vbroadcasti128 ymm9, [r10+224] + vbroadcasti128 ymm9, OWORD PTR [r10+224] L_AES_XTS_decrypt_update_vaes_aes_dec_32_aes_dec_block_last: vaesdeclast ymm0, ymm0, ymm9 vpxor ymm0, ymm0, ymm4 @@ -5084,32 +5078,28 @@ AES_XTS_init_avx512 ENDP _TEXT ENDS 
_DATA SEGMENT ALIGN 16 -L_avx512_aes_xts_gc_xts DWORD \ - 00000087h, 00000000h, 00000001h, 00000000h +L_avx512_aes_xts_gc_xts DWORD 00000087h, 00000000h, 00000001h, 00000000h ptr_L_avx512_aes_xts_gc_xts QWORD L_avx512_aes_xts_gc_xts _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_xts_poly DWORD \ - 00000087h, 00000000h, 00000000h, 00000000h +L_avx512_aes_xts_poly DWORD 00000087h, 00000000h, 00000000h, 00000000h ptr_L_avx512_aes_xts_poly QWORD L_avx512_aes_xts_poly _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_xts_shl DWORD \ - 00000000h, 00000000h, 00000000h, 00000000h, - 00000001h, 00000000h, 00000001h, 00000000h, - 00000002h, 00000000h, 00000002h, 00000000h, - 00000003h, 00000000h, 00000003h, 00000000h +L_avx512_aes_xts_shl DWORD 00000000h, 00000000h, 00000000h, 00000000h + DWORD 00000001h, 00000000h, 00000001h, 00000000h + DWORD 00000002h, 00000000h, 00000002h, 00000000h + DWORD 00000003h, 00000000h, 00000003h, 00000000h ptr_L_avx512_aes_xts_shl QWORD L_avx512_aes_xts_shl _DATA ENDS _DATA SEGMENT ALIGN 16 -L_avx512_aes_xts_shr DWORD \ - 00000040h, 00000000h, 00000040h, 00000000h, - 0000003fh, 00000000h, 0000003fh, 00000000h, - 0000003eh, 00000000h, 0000003eh, 00000000h, - 0000003dh, 00000000h, 0000003dh, 00000000h +L_avx512_aes_xts_shr DWORD 00000040h, 00000000h, 00000040h, 00000000h + DWORD 0000003fh, 00000000h, 0000003fh, 00000000h + DWORD 0000003eh, 00000000h, 0000003eh, 00000000h + DWORD 0000003dh, 00000000h, 0000003dh, 00000000h ptr_L_avx512_aes_xts_shr QWORD L_avx512_aes_xts_shr _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -5179,25 +5169,25 @@ L_AES_XTS_encrypt_avx512_tweak_aes_enc_block_last: xor r13d, r13d cmp eax, 32 jl L_AES_XTS_encrypt_avx512_done_128 - vbroadcasti32x4 zmm16, [r8] - vbroadcasti32x4 zmm17, [r8+16] - vbroadcasti32x4 zmm18, [r8+32] - vbroadcasti32x4 zmm19, [r8+48] - vbroadcasti32x4 zmm20, [r8+64] - vbroadcasti32x4 zmm21, [r8+80] - vbroadcasti32x4 zmm22, [r8+96] - vbroadcasti32x4 zmm23, [r8+112] - vbroadcasti32x4 zmm24, [r8+128] - 
vbroadcasti32x4 zmm25, [r8+144] - vbroadcasti32x4 zmm26, [r8+160] + vbroadcasti32x4 zmm16, OWORD PTR [r8] + vbroadcasti32x4 zmm17, OWORD PTR [r8+16] + vbroadcasti32x4 zmm18, OWORD PTR [r8+32] + vbroadcasti32x4 zmm19, OWORD PTR [r8+48] + vbroadcasti32x4 zmm20, OWORD PTR [r8+64] + vbroadcasti32x4 zmm21, OWORD PTR [r8+80] + vbroadcasti32x4 zmm22, OWORD PTR [r8+96] + vbroadcasti32x4 zmm23, OWORD PTR [r8+112] + vbroadcasti32x4 zmm24, OWORD PTR [r8+128] + vbroadcasti32x4 zmm25, OWORD PTR [r8+144] + vbroadcasti32x4 zmm26, OWORD PTR [r8+160] cmp r10d, 11 jl L_AES_XTS_encrypt_avx512_key_cached - vbroadcasti32x4 zmm27, [r8+176] - vbroadcasti32x4 zmm28, [r8+192] + vbroadcasti32x4 zmm27, OWORD PTR [r8+176] + vbroadcasti32x4 zmm28, OWORD PTR [r8+192] cmp r10d, 13 jl L_AES_XTS_encrypt_avx512_key_cached - vbroadcasti32x4 zmm29, [r8+208] - vbroadcasti32x4 zmm30, [r8+224] + vbroadcasti32x4 zmm29, OWORD PTR [r8+208] + vbroadcasti32x4 zmm30, OWORD PTR [r8+224] L_AES_XTS_encrypt_avx512_key_cached: cmp eax, 256 mov r11d, eax @@ -5665,25 +5655,25 @@ AES_XTS_encrypt_update_avx512 PROC xor r12d, r12d cmp eax, 32 jl L_AES_XTS_encrypt_update_avx512_done_128 - vbroadcasti32x4 zmm16, [r10] - vbroadcasti32x4 zmm17, [r10+16] - vbroadcasti32x4 zmm18, [r10+32] - vbroadcasti32x4 zmm19, [r10+48] - vbroadcasti32x4 zmm20, [r10+64] - vbroadcasti32x4 zmm21, [r10+80] - vbroadcasti32x4 zmm22, [r10+96] - vbroadcasti32x4 zmm23, [r10+112] - vbroadcasti32x4 zmm24, [r10+128] - vbroadcasti32x4 zmm25, [r10+144] - vbroadcasti32x4 zmm26, [r10+160] + vbroadcasti32x4 zmm16, OWORD PTR [r10] + vbroadcasti32x4 zmm17, OWORD PTR [r10+16] + vbroadcasti32x4 zmm18, OWORD PTR [r10+32] + vbroadcasti32x4 zmm19, OWORD PTR [r10+48] + vbroadcasti32x4 zmm20, OWORD PTR [r10+64] + vbroadcasti32x4 zmm21, OWORD PTR [r10+80] + vbroadcasti32x4 zmm22, OWORD PTR [r10+96] + vbroadcasti32x4 zmm23, OWORD PTR [r10+112] + vbroadcasti32x4 zmm24, OWORD PTR [r10+128] + vbroadcasti32x4 zmm25, OWORD PTR [r10+144] + vbroadcasti32x4 zmm26, OWORD PTR 
[r10+160] cmp r9d, 11 jl L_AES_XTS_encrypt_update_avx512_key_cached - vbroadcasti32x4 zmm27, [r10+176] - vbroadcasti32x4 zmm28, [r10+192] + vbroadcasti32x4 zmm27, OWORD PTR [r10+176] + vbroadcasti32x4 zmm28, OWORD PTR [r10+192] cmp r9d, 13 jl L_AES_XTS_encrypt_update_avx512_key_cached - vbroadcasti32x4 zmm29, [r10+208] - vbroadcasti32x4 zmm30, [r10+224] + vbroadcasti32x4 zmm29, OWORD PTR [r10+208] + vbroadcasti32x4 zmm30, OWORD PTR [r10+224] L_AES_XTS_encrypt_update_avx512_key_cached: cmp eax, 256 mov r11d, eax @@ -6196,25 +6186,25 @@ L_AES_XTS_decrypt_avx512_tweak_aes_enc_block_last: L_AES_XTS_decrypt_avx512_mul16_256: cmp r11d, 32 jl L_AES_XTS_decrypt_avx512_done_128 - vbroadcasti32x4 zmm16, [r8] - vbroadcasti32x4 zmm17, [r8+16] - vbroadcasti32x4 zmm18, [r8+32] - vbroadcasti32x4 zmm19, [r8+48] - vbroadcasti32x4 zmm20, [r8+64] - vbroadcasti32x4 zmm21, [r8+80] - vbroadcasti32x4 zmm22, [r8+96] - vbroadcasti32x4 zmm23, [r8+112] - vbroadcasti32x4 zmm24, [r8+128] - vbroadcasti32x4 zmm25, [r8+144] - vbroadcasti32x4 zmm26, [r8+160] + vbroadcasti32x4 zmm16, OWORD PTR [r8] + vbroadcasti32x4 zmm17, OWORD PTR [r8+16] + vbroadcasti32x4 zmm18, OWORD PTR [r8+32] + vbroadcasti32x4 zmm19, OWORD PTR [r8+48] + vbroadcasti32x4 zmm20, OWORD PTR [r8+64] + vbroadcasti32x4 zmm21, OWORD PTR [r8+80] + vbroadcasti32x4 zmm22, OWORD PTR [r8+96] + vbroadcasti32x4 zmm23, OWORD PTR [r8+112] + vbroadcasti32x4 zmm24, OWORD PTR [r8+128] + vbroadcasti32x4 zmm25, OWORD PTR [r8+144] + vbroadcasti32x4 zmm26, OWORD PTR [r8+160] cmp r10d, 11 jl L_AES_XTS_decrypt_avx512_key_cached - vbroadcasti32x4 zmm27, [r8+176] - vbroadcasti32x4 zmm28, [r8+192] + vbroadcasti32x4 zmm27, OWORD PTR [r8+176] + vbroadcasti32x4 zmm28, OWORD PTR [r8+192] cmp r10d, 13 jl L_AES_XTS_decrypt_avx512_key_cached - vbroadcasti32x4 zmm29, [r8+208] - vbroadcasti32x4 zmm30, [r8+224] + vbroadcasti32x4 zmm29, OWORD PTR [r8+208] + vbroadcasti32x4 zmm30, OWORD PTR [r8+224] L_AES_XTS_decrypt_avx512_key_cached: cmp r11d, 256 jl 
L_AES_XTS_decrypt_avx512_done_256 @@ -6766,25 +6756,25 @@ AES_XTS_decrypt_update_avx512 PROC L_AES_XTS_decrypt_update_avx512_mul16_256: cmp r11d, 32 jl L_AES_XTS_decrypt_update_avx512_done_128 - vbroadcasti32x4 zmm16, [r10] - vbroadcasti32x4 zmm17, [r10+16] - vbroadcasti32x4 zmm18, [r10+32] - vbroadcasti32x4 zmm19, [r10+48] - vbroadcasti32x4 zmm20, [r10+64] - vbroadcasti32x4 zmm21, [r10+80] - vbroadcasti32x4 zmm22, [r10+96] - vbroadcasti32x4 zmm23, [r10+112] - vbroadcasti32x4 zmm24, [r10+128] - vbroadcasti32x4 zmm25, [r10+144] - vbroadcasti32x4 zmm26, [r10+160] + vbroadcasti32x4 zmm16, OWORD PTR [r10] + vbroadcasti32x4 zmm17, OWORD PTR [r10+16] + vbroadcasti32x4 zmm18, OWORD PTR [r10+32] + vbroadcasti32x4 zmm19, OWORD PTR [r10+48] + vbroadcasti32x4 zmm20, OWORD PTR [r10+64] + vbroadcasti32x4 zmm21, OWORD PTR [r10+80] + vbroadcasti32x4 zmm22, OWORD PTR [r10+96] + vbroadcasti32x4 zmm23, OWORD PTR [r10+112] + vbroadcasti32x4 zmm24, OWORD PTR [r10+128] + vbroadcasti32x4 zmm25, OWORD PTR [r10+144] + vbroadcasti32x4 zmm26, OWORD PTR [r10+160] cmp r9d, 11 jl L_AES_XTS_decrypt_update_avx512_key_cached - vbroadcasti32x4 zmm27, [r10+176] - vbroadcasti32x4 zmm28, [r10+192] + vbroadcasti32x4 zmm27, OWORD PTR [r10+176] + vbroadcasti32x4 zmm28, OWORD PTR [r10+192] cmp r9d, 13 jl L_AES_XTS_decrypt_update_avx512_key_cached - vbroadcasti32x4 zmm29, [r10+208] - vbroadcasti32x4 zmm30, [r10+224] + vbroadcasti32x4 zmm29, OWORD PTR [r10+208] + vbroadcasti32x4 zmm30, OWORD PTR [r10+224] L_AES_XTS_decrypt_update_avx512_key_cached: cmp r11d, 256 jl L_AES_XTS_decrypt_update_avx512_done_256 diff --git a/wolfcrypt/src/chacha_asm.asm b/wolfcrypt/src/chacha_asm.asm index b9444254c90..80afbfdb3b0 100644 --- a/wolfcrypt/src/chacha_asm.asm +++ b/wolfcrypt/src/chacha_asm.asm @@ -462,26 +462,22 @@ _TEXT ENDS IFDEF HAVE_INTEL_AVX1 _DATA SEGMENT ALIGN 16 -L_chacha20_avx1_rotl8 QWORD \ - 0605040702010003h, 0e0d0c0f0a09080bh +L_chacha20_avx1_rotl8 QWORD 0605040702010003h, 0e0d0c0f0a09080bh 
ptr_L_chacha20_avx1_rotl8 QWORD L_chacha20_avx1_rotl8 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_chacha20_avx1_rotl16 QWORD \ - 0504070601000302h, 0d0c0f0e09080b0ah +L_chacha20_avx1_rotl16 QWORD 0504070601000302h, 0d0c0f0e09080b0ah ptr_L_chacha20_avx1_rotl16 QWORD L_chacha20_avx1_rotl16 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_chacha20_avx1_add QWORD \ - 0000000100000000h, 0000000300000002h +L_chacha20_avx1_add QWORD 0000000100000000h, 0000000300000002h ptr_L_chacha20_avx1_add QWORD L_chacha20_avx1_add _DATA ENDS _DATA SEGMENT ALIGN 16 -L_chacha20_avx1_four QWORD \ - 0000000400000004h, 0000000400000004h +L_chacha20_avx1_four QWORD 0000000400000004h, 0000000400000004h ptr_L_chacha20_avx1_four QWORD L_chacha20_avx1_four _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -1019,30 +1015,26 @@ ENDIF IFDEF HAVE_INTEL_AVX2 _DATA SEGMENT ALIGN 16 -L_chacha20_avx2_rotl8 QWORD \ - 0605040702010003h, 0e0d0c0f0a09080bh, - 0605040702010003h, 0e0d0c0f0a09080bh +L_chacha20_avx2_rotl8 QWORD 0605040702010003h, 0e0d0c0f0a09080bh + QWORD 0605040702010003h, 0e0d0c0f0a09080bh ptr_L_chacha20_avx2_rotl8 QWORD L_chacha20_avx2_rotl8 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_chacha20_avx2_rotl16 QWORD \ - 0504070601000302h, 0d0c0f0e09080b0ah, - 0504070601000302h, 0d0c0f0e09080b0ah +L_chacha20_avx2_rotl16 QWORD 0504070601000302h, 0d0c0f0e09080b0ah + QWORD 0504070601000302h, 0d0c0f0e09080b0ah ptr_L_chacha20_avx2_rotl16 QWORD L_chacha20_avx2_rotl16 _DATA ENDS _DATA SEGMENT ALIGN 16 -L_chacha20_avx2_add QWORD \ - 0000000100000000h, 0000000300000002h, - 0000000500000004h, 0000000700000006h +L_chacha20_avx2_add QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000500000004h, 0000000700000006h ptr_L_chacha20_avx2_add QWORD L_chacha20_avx2_add _DATA ENDS _DATA SEGMENT ALIGN 16 -L_chacha20_avx2_eight QWORD \ - 0000000800000008h, 0000000800000008h, - 0000000800000008h, 0000000800000008h +L_chacha20_avx2_eight QWORD 0000000800000008h, 0000000800000008h + QWORD 0000000800000008h, 0000000800000008h 
ptr_L_chacha20_avx2_eight QWORD L_chacha20_avx2_eight _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -1093,29 +1085,29 @@ chacha_encrypt_avx2 PROC vpbroadcastd ymm14, DWORD PTR [rcx+56] vpbroadcastd ymm15, DWORD PTR [rcx+60] vpaddd ymm12, ymm12, YMMWORD PTR [r15] - vmovdqa YMMWORD PTR [r11], ymm0 - vmovdqa YMMWORD PTR [r11+32], ymm1 - vmovdqa YMMWORD PTR [r11+64], ymm2 - vmovdqa YMMWORD PTR [r11+96], ymm3 - vmovdqa YMMWORD PTR [r11+128], ymm4 - vmovdqa YMMWORD PTR [r11+160], ymm5 - vmovdqa YMMWORD PTR [r11+192], ymm6 - vmovdqa YMMWORD PTR [r11+224], ymm7 - vmovdqa YMMWORD PTR [r11+256], ymm8 - vmovdqa YMMWORD PTR [r11+288], ymm9 - vmovdqa YMMWORD PTR [r11+320], ymm10 - vmovdqa YMMWORD PTR [r11+352], ymm11 - vmovdqa YMMWORD PTR [r11+384], ymm12 - vmovdqa YMMWORD PTR [r11+416], ymm13 - vmovdqa YMMWORD PTR [r11+448], ymm14 - vmovdqa YMMWORD PTR [r11+480], ymm15 + vmovdqu YMMWORD PTR [r11], ymm0 + vmovdqu YMMWORD PTR [r11+32], ymm1 + vmovdqu YMMWORD PTR [r11+64], ymm2 + vmovdqu YMMWORD PTR [r11+96], ymm3 + vmovdqu YMMWORD PTR [r11+128], ymm4 + vmovdqu YMMWORD PTR [r11+160], ymm5 + vmovdqu YMMWORD PTR [r11+192], ymm6 + vmovdqu YMMWORD PTR [r11+224], ymm7 + vmovdqu YMMWORD PTR [r11+256], ymm8 + vmovdqu YMMWORD PTR [r11+288], ymm9 + vmovdqu YMMWORD PTR [r11+320], ymm10 + vmovdqu YMMWORD PTR [r11+352], ymm11 + vmovdqu YMMWORD PTR [r11+384], ymm12 + vmovdqu YMMWORD PTR [r11+416], ymm13 + vmovdqu YMMWORD PTR [r11+448], ymm14 + vmovdqu YMMWORD PTR [r11+480], ymm15 L_chacha20_avx2_start256: mov r10b, 10 - vmovdqa YMMWORD PTR [r12+96], ymm11 + vmovdqu YMMWORD PTR [r12+96], ymm11 L_chacha20_avx2_loop256: vpaddd ymm0, ymm0, ymm4 vpxor ymm12, ymm12, ymm0 - vmovdqa ymm11, YMMWORD PTR [r12+96] + vmovdqu ymm11, YMMWORD PTR [r12+96] vpshufb ymm12, ymm12, YMMWORD PTR [r14] vpaddd ymm8, ymm8, ymm12 vpxor ymm4, ymm4, ymm8 @@ -1134,7 +1126,7 @@ L_chacha20_avx2_loop256: vpshufb ymm15, ymm15, YMMWORD PTR [r14] vpaddd ymm11, ymm11, ymm15 vpxor ymm7, ymm7, ymm11 - vmovdqa YMMWORD PTR [r12+96], 
ymm11 + vmovdqu YMMWORD PTR [r12+96], ymm11 vpsrld ymm11, ymm4, 20 vpslld ymm4, ymm4, 12 vpxor ymm4, ymm4, ymm11 @@ -1149,7 +1141,7 @@ L_chacha20_avx2_loop256: vpxor ymm7, ymm7, ymm11 vpaddd ymm0, ymm0, ymm4 vpxor ymm12, ymm12, ymm0 - vmovdqa ymm11, YMMWORD PTR [r12+96] + vmovdqu ymm11, YMMWORD PTR [r12+96] vpshufb ymm12, ymm12, YMMWORD PTR [r13] vpaddd ymm8, ymm8, ymm12 vpxor ymm4, ymm4, ymm8 @@ -1168,7 +1160,7 @@ L_chacha20_avx2_loop256: vpshufb ymm15, ymm15, YMMWORD PTR [r13] vpaddd ymm11, ymm11, ymm15 vpxor ymm7, ymm7, ymm11 - vmovdqa YMMWORD PTR [r12+96], ymm11 + vmovdqu YMMWORD PTR [r12+96], ymm11 vpsrld ymm11, ymm4, 25 vpslld ymm4, ymm4, 7 vpxor ymm4, ymm4, ymm11 @@ -1183,7 +1175,7 @@ L_chacha20_avx2_loop256: vpxor ymm7, ymm7, ymm11 vpaddd ymm0, ymm0, ymm5 vpxor ymm15, ymm15, ymm0 - vmovdqa ymm11, YMMWORD PTR [r12+96] + vmovdqu ymm11, YMMWORD PTR [r12+96] vpshufb ymm15, ymm15, YMMWORD PTR [r14] vpaddd ymm10, ymm10, ymm15 vpxor ymm5, ymm5, ymm10 @@ -1202,7 +1194,7 @@ L_chacha20_avx2_loop256: vpshufb ymm14, ymm14, YMMWORD PTR [r14] vpaddd ymm9, ymm9, ymm14 vpxor ymm4, ymm4, ymm9 - vmovdqa YMMWORD PTR [r12+96], ymm11 + vmovdqu YMMWORD PTR [r12+96], ymm11 vpsrld ymm11, ymm5, 20 vpslld ymm5, ymm5, 12 vpxor ymm5, ymm5, ymm11 @@ -1217,7 +1209,7 @@ L_chacha20_avx2_loop256: vpxor ymm4, ymm4, ymm11 vpaddd ymm0, ymm0, ymm5 vpxor ymm15, ymm15, ymm0 - vmovdqa ymm11, YMMWORD PTR [r12+96] + vmovdqu ymm11, YMMWORD PTR [r12+96] vpshufb ymm15, ymm15, YMMWORD PTR [r13] vpaddd ymm10, ymm10, ymm15 vpxor ymm5, ymm5, ymm10 @@ -1236,7 +1228,7 @@ L_chacha20_avx2_loop256: vpshufb ymm14, ymm14, YMMWORD PTR [r13] vpaddd ymm9, ymm9, ymm14 vpxor ymm4, ymm4, ymm9 - vmovdqa YMMWORD PTR [r12+96], ymm11 + vmovdqu YMMWORD PTR [r12+96], ymm11 vpsrld ymm11, ymm5, 25 vpslld ymm5, ymm5, 7 vpxor ymm5, ymm5, ymm11 @@ -1251,7 +1243,7 @@ L_chacha20_avx2_loop256: vpxor ymm4, ymm4, ymm11 dec r10b jnz L_chacha20_avx2_loop256 - vmovdqa ymm11, YMMWORD PTR [r12+96] + vmovdqu ymm11, YMMWORD PTR [r12+96] 
vpaddd ymm0, ymm0, YMMWORD PTR [r11] vpaddd ymm1, ymm1, YMMWORD PTR [r11+32] vpaddd ymm2, ymm2, YMMWORD PTR [r11+64] @@ -1268,14 +1260,14 @@ L_chacha20_avx2_loop256: vpaddd ymm13, ymm13, YMMWORD PTR [r11+416] vpaddd ymm14, ymm14, YMMWORD PTR [r11+448] vpaddd ymm15, ymm15, YMMWORD PTR [r11+480] - vmovdqa YMMWORD PTR [r12], ymm8 - vmovdqa YMMWORD PTR [r12+32], ymm9 - vmovdqa YMMWORD PTR [r12+64], ymm10 - vmovdqa YMMWORD PTR [r12+96], ymm11 - vmovdqa YMMWORD PTR [r12+128], ymm12 - vmovdqa YMMWORD PTR [r12+160], ymm13 - vmovdqa YMMWORD PTR [r12+192], ymm14 - vmovdqa YMMWORD PTR [r12+224], ymm15 + vmovdqu YMMWORD PTR [r12], ymm8 + vmovdqu YMMWORD PTR [r12+32], ymm9 + vmovdqu YMMWORD PTR [r12+64], ymm10 + vmovdqu YMMWORD PTR [r12+96], ymm11 + vmovdqu YMMWORD PTR [r12+128], ymm12 + vmovdqu YMMWORD PTR [r12+160], ymm13 + vmovdqu YMMWORD PTR [r12+192], ymm14 + vmovdqu YMMWORD PTR [r12+224], ymm15 vpunpckldq ymm8, ymm0, ymm1 vpunpckldq ymm9, ymm2, ymm3 vpunpckhdq ymm12, ymm0, ymm1 @@ -1324,14 +1316,14 @@ L_chacha20_avx2_loop256: vmovdqu YMMWORD PTR [r8+320], ymm13 vmovdqu YMMWORD PTR [r8+384], ymm14 vmovdqu YMMWORD PTR [r8+448], ymm15 - vmovdqa ymm0, YMMWORD PTR [r12] - vmovdqa ymm1, YMMWORD PTR [r12+32] - vmovdqa ymm2, YMMWORD PTR [r12+64] - vmovdqa ymm3, YMMWORD PTR [r12+96] - vmovdqa ymm4, YMMWORD PTR [r12+128] - vmovdqa ymm5, YMMWORD PTR [r12+160] - vmovdqa ymm6, YMMWORD PTR [r12+192] - vmovdqa ymm7, YMMWORD PTR [r12+224] + vmovdqu ymm0, YMMWORD PTR [r12] + vmovdqu ymm1, YMMWORD PTR [r12+32] + vmovdqu ymm2, YMMWORD PTR [r12+64] + vmovdqu ymm3, YMMWORD PTR [r12+96] + vmovdqu ymm4, YMMWORD PTR [r12+128] + vmovdqu ymm5, YMMWORD PTR [r12+160] + vmovdqu ymm6, YMMWORD PTR [r12+192] + vmovdqu ymm7, YMMWORD PTR [r12+224] vpunpckldq ymm8, ymm0, ymm1 vpunpckldq ymm9, ymm2, ymm3 vpunpckhdq ymm12, ymm0, ymm1 @@ -1380,30 +1372,30 @@ L_chacha20_avx2_loop256: vmovdqu YMMWORD PTR [r8+352], ymm13 vmovdqu YMMWORD PTR [r8+416], ymm14 vmovdqu YMMWORD PTR [r8+480], ymm15 - vmovdqa ymm12, 
YMMWORD PTR [r11+384] + vmovdqu ymm12, YMMWORD PTR [r11+384] add rdx, 512 add r8, 512 vpaddd ymm12, ymm12, YMMWORD PTR [rdi] sub r9d, 512 - vmovdqa YMMWORD PTR [r11+384], ymm12 + vmovdqu YMMWORD PTR [r11+384], ymm12 cmp r9d, 512 jl L_chacha20_avx2_done256 - vmovdqa ymm0, YMMWORD PTR [r11] - vmovdqa ymm1, YMMWORD PTR [r11+32] - vmovdqa ymm2, YMMWORD PTR [r11+64] - vmovdqa ymm3, YMMWORD PTR [r11+96] - vmovdqa ymm4, YMMWORD PTR [r11+128] - vmovdqa ymm5, YMMWORD PTR [r11+160] - vmovdqa ymm6, YMMWORD PTR [r11+192] - vmovdqa ymm7, YMMWORD PTR [r11+224] - vmovdqa ymm8, YMMWORD PTR [r11+256] - vmovdqa ymm9, YMMWORD PTR [r11+288] - vmovdqa ymm10, YMMWORD PTR [r11+320] - vmovdqa ymm11, YMMWORD PTR [r11+352] - vmovdqa ymm12, YMMWORD PTR [r11+384] - vmovdqa ymm13, YMMWORD PTR [r11+416] - vmovdqa ymm14, YMMWORD PTR [r11+448] - vmovdqa ymm15, YMMWORD PTR [r11+480] + vmovdqu ymm0, YMMWORD PTR [r11] + vmovdqu ymm1, YMMWORD PTR [r11+32] + vmovdqu ymm2, YMMWORD PTR [r11+64] + vmovdqu ymm3, YMMWORD PTR [r11+96] + vmovdqu ymm4, YMMWORD PTR [r11+128] + vmovdqu ymm5, YMMWORD PTR [r11+160] + vmovdqu ymm6, YMMWORD PTR [r11+192] + vmovdqu ymm7, YMMWORD PTR [r11+224] + vmovdqu ymm8, YMMWORD PTR [r11+256] + vmovdqu ymm9, YMMWORD PTR [r11+288] + vmovdqu ymm10, YMMWORD PTR [r11+320] + vmovdqu ymm11, YMMWORD PTR [r11+352] + vmovdqu ymm12, YMMWORD PTR [r11+384] + vmovdqu ymm13, YMMWORD PTR [r11+416] + vmovdqu ymm14, YMMWORD PTR [r11+448] + vmovdqu ymm15, YMMWORD PTR [r11+480] jmp L_chacha20_avx2_start256 L_chacha20_avx2_done256: shl eax, 3 diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S index 7e976fa1f28..d402203fc3c 100644 --- a/wolfcrypt/src/fe_x25519_asm.S +++ b/wolfcrypt/src/fe_x25519_asm.S @@ -2341,6 +2341,9 @@ _fe_invert_x64: movq 128(%rsp), %rdi addq $0x90, %rsp repz retq +#ifndef __APPLE__ +.size fe_invert_x64,.-fe_invert_x64 +#endif /* __APPLE__ */ #if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519) #ifndef __APPLE__ .data @@ -7339,6 +7342,9 @@ _fe_pow22523_x64: 
addq $0x70, %rsp repz retq #ifndef __APPLE__ +.size fe_pow22523_x64,.-fe_pow22523_x64 +#endif /* __APPLE__ */ +#ifndef __APPLE__ .text .globl ge_p1p1_to_p2_x64 .type ge_p1p1_to_p2_x64,@function @@ -13201,6 +13207,9 @@ _fe_invert_avx2: movq 128(%rsp), %rdi addq $0x90, %rsp repz retq +#ifndef __APPLE__ +.size fe_invert_avx2,.-fe_invert_avx2 +#endif /* __APPLE__ */ #if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519) #ifndef __APPLE__ .data @@ -17403,6 +17412,9 @@ _fe_pow22523_avx2: addq $0x70, %rsp repz retq #ifndef __APPLE__ +.size fe_pow22523_avx2,.-fe_pow22523_avx2 +#endif /* __APPLE__ */ +#ifndef __APPLE__ .text .globl ge_p1p1_to_p2_avx2 .type ge_p1p1_to_p2_avx2,@function diff --git a/wolfcrypt/src/fe_x25519_asm.asm b/wolfcrypt/src/fe_x25519_asm.asm new file mode 100644 index 00000000000..fa3e671ba13 --- /dev/null +++ b/wolfcrypt/src/fe_x25519_asm.asm @@ -0,0 +1,19760 @@ +; /* fe_x25519_asm.asm */ +; /* +; * Copyright (C) 2006-2026 wolfSSL Inc. +; * +; * This file is part of wolfSSL. +; * +; * wolfSSL is free software; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation; either version 3 of the License, or +; * (at your option) any later version. +; * +; * wolfSSL is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. 
+; * +; * You should have received a copy of the GNU General Public License +; * along with this program; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA +; */ + +IF @Version LT 1200 +; AVX2 instructions not recognized by old versions of MASM +IFNDEF NO_AVX2_SUPPORT +NO_AVX2_SUPPORT = 1 +ENDIF +; MOVBE instruction not recognized by old versions of MASM +IFNDEF NO_MOVBE_SUPPORT +NO_MOVBE_SUPPORT = 1 +ENDIF +ENDIF + +IFNDEF HAVE_INTEL_AVX1 +HAVE_INTEL_AVX1 = 1 +ENDIF +IFNDEF NO_AVX2_SUPPORT +HAVE_INTEL_AVX2 = 1 +ENDIF + +IFNDEF _WIN64 +_WIN64 = 1 +ENDIF + +EXTERN cpuid_get_flags:PROC +_TEXT SEGMENT READONLY PARA +fe_init PROC +IFDEF HAVE_INTEL_AVX2 + mov eax, DWORD PTR [cpuFlagsSet] + test eax, eax + je L_fe_init_get_flags + ret +L_fe_init_get_flags: + sub rsp, 40 + call cpuid_get_flags + add rsp, 40 + mov DWORD PTR [intelFlags], eax + and eax, 80 + cmp eax, 80 + jne L_fe_init_flags_done + lea rax, [fe_cmov_table_avx2] + mov QWORD PTR [fe_cmov_table_p], rax + lea rax, [fe_mul_avx2] + mov QWORD PTR [fe_mul_p], rax + lea rax, [fe_sq_avx2] + mov QWORD PTR [fe_sq_p], rax + lea rax, [fe_mul121666_avx2] + mov QWORD PTR [fe_mul121666_p], rax + lea rax, [fe_invert_avx2] + mov QWORD PTR [fe_invert_p], rax + lea rax, [curve25519_avx2] + mov QWORD PTR [curve25519_p], rax + lea rax, [fe_pow22523_avx2] + mov QWORD PTR [fe_pow22523_p], rax + lea rax, [ge_p1p1_to_p2_avx2] + mov QWORD PTR [ge_p1p1_to_p2_p], rax + lea rax, [ge_p1p1_to_p3_avx2] + mov QWORD PTR [ge_p1p1_to_p3_p], rax + lea rax, [ge_p2_dbl_avx2] + mov QWORD PTR [ge_p2_dbl_p], rax + lea rax, [ge_madd_avx2] + mov QWORD PTR [ge_madd_p], rax + lea rax, [ge_msub_avx2] + mov QWORD PTR [ge_msub_p], rax + lea rax, [ge_add_avx2] + mov QWORD PTR [ge_add_p], rax + lea rax, [ge_sub_avx2] + mov QWORD PTR [ge_sub_p], rax +IFDEF WOLFSSL_CURVE25519_NOT_USE_ED25519 + lea rax, [curve25519_base_avx2] + mov QWORD PTR [curve25519_base_p], rax +ENDIF +IFDEF HAVE_ED25519 + 
lea rax, [fe_sq2_avx2] + mov QWORD PTR [fe_sq2_p], rax + lea rax, [fe_invert_nct_avx2] + mov QWORD PTR [fe_invert_nct_p], rax + lea rax, [sc_reduce_avx2] + mov QWORD PTR [sc_reduce_p], rax + lea rax, [sc_muladd_avx2] + mov QWORD PTR [sc_muladd_p], rax +ENDIF +L_fe_init_flags_done: + mov DWORD PTR [cpuFlagsSet], 1 +ENDIF + ret +fe_init ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_frombytes PROC + mov r11, 9223372036854775807 + mov rax, QWORD PTR [rdx] + mov r8, QWORD PTR [rdx+8] + mov r9, QWORD PTR [rdx+16] + mov r10, QWORD PTR [rdx+24] + and r10, r11 + mov QWORD PTR [rcx], rax + mov QWORD PTR [rcx+8], r8 + mov QWORD PTR [rcx+16], r9 + mov QWORD PTR [rcx+24], r10 + ret +fe_frombytes ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_tobytes PROC + push r12 + mov r12, 9223372036854775807 + mov rax, QWORD PTR [rdx] + mov r8, QWORD PTR [rdx+8] + mov r9, QWORD PTR [rdx+16] + mov r10, QWORD PTR [rdx+24] + add rax, 19 + adc r8, 0 + adc r9, 0 + adc r10, 0 + shr r10, 63 + imul r11, r10, 19 + mov rax, QWORD PTR [rdx] + mov r8, QWORD PTR [rdx+8] + mov r9, QWORD PTR [rdx+16] + mov r10, QWORD PTR [rdx+24] + add rax, r11 + adc r8, 0 + adc r9, 0 + adc r10, 0 + and r10, r12 + mov QWORD PTR [rcx], rax + mov QWORD PTR [rcx+8], r8 + mov QWORD PTR [rcx+16], r9 + mov QWORD PTR [rcx+24], r10 + pop r12 + ret +fe_tobytes ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_1 PROC + ; Set one + mov QWORD PTR [rcx], 1 + mov QWORD PTR [rcx+8], 0 + mov QWORD PTR [rcx+16], 0 + mov QWORD PTR [rcx+24], 0 + ret +fe_1 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_0 PROC + ; Set zero + mov QWORD PTR [rcx], 0 + mov QWORD PTR [rcx+8], 0 + mov QWORD PTR [rcx+16], 0 + mov QWORD PTR [rcx+24], 0 + ret +fe_0 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_copy PROC + ; Copy + mov rax, QWORD PTR [rdx] + mov r8, QWORD PTR [rdx+8] + mov r9, QWORD PTR [rdx+16] + mov r10, QWORD PTR [rdx+24] + mov QWORD PTR [rcx], rax + mov QWORD PTR [rcx+8], r8 + mov QWORD PTR [rcx+16], r9 + mov QWORD PTR [rcx+24], r10 
+ ret +fe_copy ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_sub PROC + push r12 + ; Sub + mov rax, QWORD PTR [rdx] + mov r9, QWORD PTR [rdx+8] + mov r10, QWORD PTR [rdx+16] + mov r11, QWORD PTR [rdx+24] + sub rax, QWORD PTR [r8] + sbb r9, QWORD PTR [r8+8] + sbb r10, QWORD PTR [r8+16] + sbb r11, QWORD PTR [r8+24] + sbb r12, r12 + shld r12, r11, 1 + imul r12, -19 + btr r11, 63 + ; Add modulus (if underflow) + sub rax, r12 + sbb r9, 0 + sbb r10, 0 + sbb r11, 0 + mov QWORD PTR [rcx], rax + mov QWORD PTR [rcx+8], r9 + mov QWORD PTR [rcx+16], r10 + mov QWORD PTR [rcx+24], r11 + pop r12 + ret +fe_sub ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_add PROC + push r12 + ; Add + mov rax, QWORD PTR [rdx] + mov r9, QWORD PTR [rdx+8] + add rax, QWORD PTR [r8] + mov r10, QWORD PTR [rdx+16] + adc r9, QWORD PTR [r8+8] + mov r11, QWORD PTR [rdx+24] + adc r10, QWORD PTR [r8+16] + adc r11, QWORD PTR [r8+24] + mov r12, 0 + adc r12, 0 + shld r12, r11, 1 + imul r12, 19 + btr r11, 63 + ; Sub modulus (if overflow) + add rax, r12 + adc r9, 0 + adc r10, 0 + adc r11, 0 + mov QWORD PTR [rcx], rax + mov QWORD PTR [rcx+8], r9 + mov QWORD PTR [rcx+16], r10 + mov QWORD PTR [rcx+24], r11 + pop r12 + ret +fe_add ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_neg PROC + mov rax, -19 + mov r8, -1 + mov r9, -1 + mov r10, 9223372036854775807 + sub rax, QWORD PTR [rdx] + sbb r8, QWORD PTR [rdx+8] + sbb r9, QWORD PTR [rdx+16] + sbb r10, QWORD PTR [rdx+24] + mov QWORD PTR [rcx], rax + mov QWORD PTR [rcx+8], r8 + mov QWORD PTR [rcx+16], r9 + mov QWORD PTR [rcx+24], r10 + ret +fe_neg ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_cmov PROC + push r12 + cmp r8d, 1 + mov r9, QWORD PTR [rcx] + mov r10, QWORD PTR [rcx+8] + mov r11, QWORD PTR [rcx+16] + mov r12, QWORD PTR [rcx+24] + cmove r9, QWORD PTR [rdx] + cmove r10, QWORD PTR [rdx+8] + cmove r11, QWORD PTR [rdx+16] + cmove r12, QWORD PTR [rdx+24] + mov QWORD PTR [rcx], r9 + mov QWORD PTR [rcx+8], r10 + mov QWORD PTR [rcx+16], r11 + mov QWORD 
PTR [rcx+24], r12 + pop r12 + ret +fe_cmov ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_isnonzero PROC + mov r11, 9223372036854775807 + mov rax, QWORD PTR [rcx] + mov rdx, QWORD PTR [rcx+8] + mov r8, QWORD PTR [rcx+16] + mov r9, QWORD PTR [rcx+24] + add rax, 19 + adc rdx, 0 + adc r8, 0 + adc r9, 0 + shr r9, 63 + imul r10, r9, 19 + mov rax, QWORD PTR [rcx] + mov rdx, QWORD PTR [rcx+8] + mov r8, QWORD PTR [rcx+16] + mov r9, QWORD PTR [rcx+24] + add rax, r10 + adc rdx, 0 + adc r8, 0 + adc r9, 0 + and r9, r11 + or rax, rdx + or rax, r8 + or rax, r9 + ret +fe_isnonzero ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_isnegative PROC + push r12 + mov r12, 9223372036854775807 + mov rdx, QWORD PTR [rcx] + mov r8, QWORD PTR [rcx+8] + mov r9, QWORD PTR [rcx+16] + mov r10, QWORD PTR [rcx+24] + mov rax, rdx + add rdx, 19 + adc r8, 0 + adc r9, 0 + adc r10, 0 + shr r10, 63 + imul r11, r10, 19 + add rax, r11 + and rax, 1 + pop r12 + ret +fe_isnegative ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_cmov_table PROC + jmp QWORD PTR [fe_cmov_table_p] +fe_cmov_table ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_mul PROC + jmp QWORD PTR [fe_mul_p] +fe_mul ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_sq PROC + jmp QWORD PTR [fe_sq_p] +fe_sq ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_mul121666 PROC + jmp QWORD PTR [fe_mul121666_p] +fe_mul121666 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_invert PROC + jmp QWORD PTR [fe_invert_p] +fe_invert ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +curve25519 PROC + jmp QWORD PTR [curve25519_p] +curve25519 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_pow22523 PROC + jmp QWORD PTR [fe_pow22523_p] +fe_pow22523 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_p1p1_to_p2 PROC + jmp QWORD PTR [ge_p1p1_to_p2_p] +ge_p1p1_to_p2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_p1p1_to_p3 PROC + jmp QWORD PTR [ge_p1p1_to_p3_p] +ge_p1p1_to_p3 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_p2_dbl PROC + jmp QWORD PTR 
[ge_p2_dbl_p] +ge_p2_dbl ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_madd PROC + jmp QWORD PTR [ge_madd_p] +ge_madd ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_msub PROC + jmp QWORD PTR [ge_msub_p] +ge_msub ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_add PROC + jmp QWORD PTR [ge_add_p] +ge_add ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_sub PROC + jmp QWORD PTR [ge_sub_p] +ge_sub ENDP +_TEXT ENDS +IFDEF WOLFSSL_CURVE25519_NOT_USE_ED25519 +IFDEF WOLFSSL_CURVE25519_NOT_USE_ED25519 +_TEXT SEGMENT READONLY PARA +curve25519_base PROC + jmp QWORD PTR [curve25519_base_p] +curve25519_base ENDP +_TEXT ENDS +ENDIF +ENDIF +IFDEF HAVE_ED25519 +IFDEF HAVE_ED25519 +_TEXT SEGMENT READONLY PARA +fe_sq2 PROC + jmp QWORD PTR [fe_sq2_p] +fe_sq2 ENDP +_TEXT ENDS +ENDIF +IFDEF HAVE_ED25519 +_TEXT SEGMENT READONLY PARA +fe_invert_nct PROC + jmp QWORD PTR [fe_invert_nct_p] +fe_invert_nct ENDP +_TEXT ENDS +ENDIF +IFDEF HAVE_ED25519 +_TEXT SEGMENT READONLY PARA +sc_reduce PROC + jmp QWORD PTR [sc_reduce_p] +sc_reduce ENDP +_TEXT ENDS +ENDIF +IFDEF HAVE_ED25519 +_TEXT SEGMENT READONLY PARA +sc_muladd PROC + jmp QWORD PTR [sc_muladd_p] +sc_muladd ENDP +_TEXT ENDS +ENDIF +ENDIF +_DATA SEGMENT +cpuFlagsSet dd 0 +_DATA ENDS +_DATA SEGMENT +intelFlags dd 0 +_DATA ENDS +_DATA SEGMENT +fe_cmov_table_p dq fe_cmov_table_x64 +_DATA ENDS +_DATA SEGMENT +fe_mul_p dq fe_mul_x64 +_DATA ENDS +_DATA SEGMENT +fe_sq_p dq fe_sq_x64 +_DATA ENDS +_DATA SEGMENT +fe_mul121666_p dq fe_mul121666_x64 +_DATA ENDS +_DATA SEGMENT +fe_invert_p dq fe_invert_x64 +_DATA ENDS +_DATA SEGMENT +curve25519_p dq curve25519_x64 +_DATA ENDS +_DATA SEGMENT +fe_pow22523_p dq fe_pow22523_x64 +_DATA ENDS +_DATA SEGMENT +ge_p1p1_to_p2_p dq ge_p1p1_to_p2_x64 +_DATA ENDS +_DATA SEGMENT +ge_p1p1_to_p3_p dq ge_p1p1_to_p3_x64 +_DATA ENDS +_DATA SEGMENT +ge_p2_dbl_p dq ge_p2_dbl_x64 +_DATA ENDS +_DATA SEGMENT +ge_madd_p dq ge_madd_x64 +_DATA ENDS +_DATA SEGMENT +ge_msub_p dq ge_msub_x64 +_DATA ENDS +_DATA SEGMENT 
+ge_add_p dq ge_add_x64 +_DATA ENDS +_DATA SEGMENT +ge_sub_p dq ge_sub_x64 +_DATA ENDS +IFDEF WOLFSSL_CURVE25519_NOT_USE_ED25519 +_DATA SEGMENT +curve25519_base_p dq curve25519_base_x64 +_DATA ENDS +ENDIF +IFDEF HAVE_ED25519 +_DATA SEGMENT +fe_sq2_p dq fe_sq2_x64 +_DATA ENDS +_DATA SEGMENT +fe_invert_nct_p dq fe_invert_nct_x64 +_DATA ENDS +_DATA SEGMENT +sc_reduce_p dq sc_reduce_x64 +_DATA ENDS +_DATA SEGMENT +sc_muladd_p dq sc_muladd_x64 +_DATA ENDS +ENDIF +_TEXT SEGMENT READONLY PARA +fe_cmov_table_x64 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + mov r9, rdx + movsx rax, r8b + cdq + xor al, dl + sub al, dl + mov sil, al + mov rax, 1 + xor rdx, rdx + xor r10, r10 + xor r11, r11 + mov r12, 1 + xor r13, r13 + xor r14, r14 + xor r15, r15 + cmp sil, 1 + mov rdi, QWORD PTR [r9] + cmove rax, rdi + mov rdi, QWORD PTR [r9+8] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+16] + cmove r10, rdi + mov rdi, QWORD PTR [r9+24] + cmove r11, rdi + mov rdi, QWORD PTR [r9+32] + cmove r12, rdi + mov rdi, QWORD PTR [r9+40] + cmove r13, rdi + mov rdi, QWORD PTR [r9+48] + cmove r14, rdi + mov rdi, QWORD PTR [r9+56] + cmove r15, rdi + cmp sil, 2 + mov rdi, QWORD PTR [r9+96] + cmove rax, rdi + mov rdi, QWORD PTR [r9+104] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+112] + cmove r10, rdi + mov rdi, QWORD PTR [r9+120] + cmove r11, rdi + mov rdi, QWORD PTR [r9+128] + cmove r12, rdi + mov rdi, QWORD PTR [r9+136] + cmove r13, rdi + mov rdi, QWORD PTR [r9+144] + cmove r14, rdi + mov rdi, QWORD PTR [r9+152] + cmove r15, rdi + cmp sil, 3 + mov rdi, QWORD PTR [r9+192] + cmove rax, rdi + mov rdi, QWORD PTR [r9+200] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+208] + cmove r10, rdi + mov rdi, QWORD PTR [r9+216] + cmove r11, rdi + mov rdi, QWORD PTR [r9+224] + cmove r12, rdi + mov rdi, QWORD PTR [r9+232] + cmove r13, rdi + mov rdi, QWORD PTR [r9+240] + cmove r14, rdi + mov rdi, QWORD PTR [r9+248] + cmove r15, rdi + cmp sil, 4 + mov rdi, QWORD PTR [r9+288] + cmove rax, rdi + mov rdi, 
QWORD PTR [r9+296] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+304] + cmove r10, rdi + mov rdi, QWORD PTR [r9+312] + cmove r11, rdi + mov rdi, QWORD PTR [r9+320] + cmove r12, rdi + mov rdi, QWORD PTR [r9+328] + cmove r13, rdi + mov rdi, QWORD PTR [r9+336] + cmove r14, rdi + mov rdi, QWORD PTR [r9+344] + cmove r15, rdi + cmp sil, 5 + mov rdi, QWORD PTR [r9+384] + cmove rax, rdi + mov rdi, QWORD PTR [r9+392] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+400] + cmove r10, rdi + mov rdi, QWORD PTR [r9+408] + cmove r11, rdi + mov rdi, QWORD PTR [r9+416] + cmove r12, rdi + mov rdi, QWORD PTR [r9+424] + cmove r13, rdi + mov rdi, QWORD PTR [r9+432] + cmove r14, rdi + mov rdi, QWORD PTR [r9+440] + cmove r15, rdi + cmp sil, 6 + mov rdi, QWORD PTR [r9+480] + cmove rax, rdi + mov rdi, QWORD PTR [r9+488] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+496] + cmove r10, rdi + mov rdi, QWORD PTR [r9+504] + cmove r11, rdi + mov rdi, QWORD PTR [r9+512] + cmove r12, rdi + mov rdi, QWORD PTR [r9+520] + cmove r13, rdi + mov rdi, QWORD PTR [r9+528] + cmove r14, rdi + mov rdi, QWORD PTR [r9+536] + cmove r15, rdi + cmp sil, 7 + mov rdi, QWORD PTR [r9+576] + cmove rax, rdi + mov rdi, QWORD PTR [r9+584] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+592] + cmove r10, rdi + mov rdi, QWORD PTR [r9+600] + cmove r11, rdi + mov rdi, QWORD PTR [r9+608] + cmove r12, rdi + mov rdi, QWORD PTR [r9+616] + cmove r13, rdi + mov rdi, QWORD PTR [r9+624] + cmove r14, rdi + mov rdi, QWORD PTR [r9+632] + cmove r15, rdi + cmp sil, 8 + mov rdi, QWORD PTR [r9+672] + cmove rax, rdi + mov rdi, QWORD PTR [r9+680] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+688] + cmove r10, rdi + mov rdi, QWORD PTR [r9+696] + cmove r11, rdi + mov rdi, QWORD PTR [r9+704] + cmove r12, rdi + mov rdi, QWORD PTR [r9+712] + cmove r13, rdi + mov rdi, QWORD PTR [r9+720] + cmove r14, rdi + mov rdi, QWORD PTR [r9+728] + cmove r15, rdi + cmp r8b, 0 + mov rdi, rax + cmovl rax, r12 + cmovl r12, rdi + mov rdi, rdx + cmovl rdx, r13 + cmovl r13, rdi + mov rdi, r10 
+ cmovl r10, r14 + cmovl r14, rdi + mov rdi, r11 + cmovl r11, r15 + cmovl r15, rdi + mov QWORD PTR [rcx], rax + mov QWORD PTR [rcx+8], rdx + mov QWORD PTR [rcx+16], r10 + mov QWORD PTR [rcx+24], r11 + mov QWORD PTR [rcx+32], r12 + mov QWORD PTR [rcx+40], r13 + mov QWORD PTR [rcx+48], r14 + mov QWORD PTR [rcx+56], r15 + xor rax, rax + xor rdx, rdx + xor r10, r10 + xor r11, r11 + cmp sil, 1 + mov rdi, QWORD PTR [r9+64] + cmove rax, rdi + mov rdi, QWORD PTR [r9+72] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+80] + cmove r10, rdi + mov rdi, QWORD PTR [r9+88] + cmove r11, rdi + cmp sil, 2 + mov rdi, QWORD PTR [r9+160] + cmove rax, rdi + mov rdi, QWORD PTR [r9+168] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+176] + cmove r10, rdi + mov rdi, QWORD PTR [r9+184] + cmove r11, rdi + cmp sil, 3 + mov rdi, QWORD PTR [r9+256] + cmove rax, rdi + mov rdi, QWORD PTR [r9+264] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+272] + cmove r10, rdi + mov rdi, QWORD PTR [r9+280] + cmove r11, rdi + cmp sil, 4 + mov rdi, QWORD PTR [r9+352] + cmove rax, rdi + mov rdi, QWORD PTR [r9+360] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+368] + cmove r10, rdi + mov rdi, QWORD PTR [r9+376] + cmove r11, rdi + cmp sil, 5 + mov rdi, QWORD PTR [r9+448] + cmove rax, rdi + mov rdi, QWORD PTR [r9+456] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+464] + cmove r10, rdi + mov rdi, QWORD PTR [r9+472] + cmove r11, rdi + cmp sil, 6 + mov rdi, QWORD PTR [r9+544] + cmove rax, rdi + mov rdi, QWORD PTR [r9+552] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+560] + cmove r10, rdi + mov rdi, QWORD PTR [r9+568] + cmove r11, rdi + cmp sil, 7 + mov rdi, QWORD PTR [r9+640] + cmove rax, rdi + mov rdi, QWORD PTR [r9+648] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+656] + cmove r10, rdi + mov rdi, QWORD PTR [r9+664] + cmove r11, rdi + cmp sil, 8 + mov rdi, QWORD PTR [r9+736] + cmove rax, rdi + mov rdi, QWORD PTR [r9+744] + cmove rdx, rdi + mov rdi, QWORD PTR [r9+752] + cmove r10, rdi + mov rdi, QWORD PTR [r9+760] + cmove r11, rdi + mov r12, -19 + 
mov r13, -1
+ mov r14, -1
+ mov r15, 9223372036854775807
+ sub r12, rax
+ sbb r13, rdx
+ sbb r14, r10
+ sbb r15, r11
+ cmp r8b, 0
+ cmovl rax, r12
+ cmovl rdx, r13
+ cmovl r10, r14
+ cmovl r11, r15
+ mov QWORD PTR [rcx+64], rax
+ mov QWORD PTR [rcx+72], rdx
+ mov QWORD PTR [rcx+80], r10
+ mov QWORD PTR [rcx+88], r11
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_cmov_table_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ ; fe_mul_x64: multiply two 4-limb (4 x 64-bit) values and store a reduced
+ ; 4-limb result.
+ ;   rcx - destination; receives four QWORDs at [rcx]..[rcx+24]
+ ;   rdx - first operand ("A" in the comments below); copied to r9 because
+ ;         mul overwrites rdx
+ ;   r8  - second operand ("B" in the comments below)
+ ; The 512-bit product is accumulated in r10, r11, r12, r13 (low half) and
+ ; r14, r15, rdi, rsi (high half). The high half is then folded into the low
+ ; half with the factor 38, and bits at position 255 and above with the
+ ; factor 19, which matches reduction modulo 2^255 - 19.
+ ; r12-r15, rdi, rsi and rbx are pushed on entry and popped before ret.
+fe_mul_x64 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ mov r9, rdx
+ ; Multiply
+ ; A[0] * B[0]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r9]
+ mov r10, rax
+ mov r11, rdx
+ ; A[0] * B[1]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r9]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[1] * B[0]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r9+8]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ adc r13, 0
+ ; A[0] * B[2]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r9]
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * B[1]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r9+8]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[2] * B[0]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r9+16]
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[0] * B[3]
+ mov rax, QWORD PTR [r8+24]
+ mul QWORD PTR [r9]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[2]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r9+8]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[2] * B[1]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r9+16]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[3] * B[0]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r9+24]
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * B[3]
+ mov rax, QWORD PTR [r8+24]
+ mul QWORD PTR [r9+8]
+ xor rdi, rdi
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[2]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r9+16]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[3] * B[1]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r9+24]
+ add r14, rax
+ adc r15, rdx
+ adc rdi, 0
+ ; A[2] * B[3]
+ mov rax, QWORD PTR [r8+24]
+ mul QWORD PTR [r9+16]
+ xor rsi, rsi
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[2]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r9+24]
+ add r15, rax
+ adc rdi, rdx
+ adc rsi, 0
+ ; A[3] * B[3]
+ mov rax, QWORD PTR [r8+24]
+ mul QWORD PTR [r9+24]
+ add rdi, rax
+ adc rsi, rdx
+ ; Reduce: 38 * top limb (rsi) is added on top of limb 3 (r13); rdx keeps
+ ; the part that does not fit in r13.
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ ; rdx = (rdx << 1) | top bit of r13, i.e. everything at bit 255 and above.
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ ; Keep only bits 0..62 of r13; rbx then carries 19 * overflow for limb 0.
+ and r13, rbx
+ mov rbx, rdx
+ ; Fold limbs 4..6 (r14, r15, rdi) times 38 into limbs 0..2. Each register
+ ; is zeroed after use and reused to hold the high half of its product.
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add r10, rax
+ ; mov does not change the flags, so the adc below still consumes the carry
+ ; produced by the add above.
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add r10, rbx
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; If bit 255 is set after folding, clear it and add 19 once more
+ ; (sar yields an all-ones mask when the top bit of r13 is set).
+ mov rbx, 9223372036854775807
+ mov rax, r13
+ sar rax, 63
+ and rax, 19
+ and r13, rbx
+ add r10, rax
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ ; Store
+ mov QWORD PTR [rcx], r10
+ mov QWORD PTR [rcx+8], r11
+ mov QWORD PTR [rcx+16], r12
+ mov QWORD PTR [rcx+24], r13
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_mul_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ ; fe_sq_x64: square a 4-limb (4 x 64-bit) value and store a reduced 4-limb
+ ; result.
+ ;   rcx - destination; four QWORDs are stored at [rcx]..[rcx+24] at the end
+ ;         of the routine
+ ;   rdx - source operand; copied to r8 because mul overwrites rdx
+ ; The six cross products A[i]*A[j] (i < j) are computed once and doubled,
+ ; then the four squares A[i]*A[i] are added in.
+ ; r12-r15, rdi and rsi are pushed on entry.
+fe_sq_x64 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ mov r8, rdx
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+8]
+ mov r10, rax
+ mov r11, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+16]
+ xor r12, r12
+ add r11, rax
+ adc r12, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [r8]
+ mul QWORD PTR [r8+24]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+16]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ adc r14, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [r8+8]
+ mul QWORD PTR [r8+24]
+ add r13, rax
+ adc r14, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [r8+16]
+ mul QWORD PTR [r8+24]
+ xor r15, r15
+ add r14, rax
+ adc r15, rdx
+ ; Double
+ xor rdi, rdi
+ add r10, r10
+
adc r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [r8]
+ mul rax
+ mov r9, rax
+ mov rsi, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [r8+8]
+ mul rax
+ add r10, rsi
+ adc r11, rax
+ adc rdx, 0
+ mov rsi, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [r8+16]
+ mul rax
+ add r12, rsi
+ adc r13, rax
+ adc rdx, 0
+ mov rsi, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [r8+24]
+ mul rax
+ add r15, rax
+ adc rdi, rdx
+ add r14, rsi
+ adc r15, 0
+ adc rdi, 0
+ ; Reduce: the high limbs (r13, r14, r15, rdi) are folded into the low
+ ; limbs (r9, r10, r11, r12) with the factor 38; bits at position 255 and
+ ; above are folded with the factor 19 (carried in rsi).
+ mov rax, 38
+ mul rdi
+ add r12, rax
+ adc rdx, 0
+ mov rsi, 9223372036854775807
+ shld rdx, r12, 1
+ imul rdx, rdx, 19
+ and r12, rsi
+ mov rsi, rdx
+ mov rax, 38
+ mul r13
+ xor r13, r13
+ add r9, rax
+ mov rax, 38
+ adc r13, rdx
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ adc r15, rdx
+ add r9, rsi
+ adc r10, r13
+ adc r11, r14
+ adc r12, r15
+ ; If bit 255 is set after folding, clear it and add 19 once more.
+ mov rsi, 9223372036854775807
+ mov rax, r12
+ sar rax, 63
+ and rax, 19
+ and r12, rsi
+ add r9, rax
+ adc r10, 0
+ adc r11, 0
+ adc r12, 0
+ ; Store
+ mov QWORD PTR [rcx], r9
+ mov QWORD PTR [rcx+8], r10
+ mov QWORD PTR [rcx+16], r11
+ mov QWORD PTR [rcx+24], r12
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_sq_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ ; fe_sq_n_x64: run the square-and-reduce body below a counted number of
+ ; times.
+ ;   rcx - destination; four QWORDs are stored at [rcx]..[rcx+24] on every
+ ;         pass
+ ;   rdx - source operand; copied to r9 because mul overwrites rdx
+ ;   r8b - pass count (only the low byte of r8 is examined)
+ ; Every pass loads its input from [r9] and stores to [rcx], so successive
+ ; squarings only chain when rcx and rdx refer to the same buffer; the calls
+ ; in fe_invert_x64 in this file pass the same address for both.
+ ; The body runs before the count is tested (dec/jnz at the bottom), so a
+ ; count of 0 would wrap around and run 256 passes.
+ ; Unlike fe_sq_x64, the result is stored without the final
+ ; "add 19 if bit 255 is set" step.
+ ; r12-r15, rdi, rsi and rbx are pushed on entry and popped before ret.
+fe_sq_n_x64 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ mov r9, rdx
+L_fe_sq_n_x64:
+ ; Square
+ ; A[0] * A[1]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+8]
+ mov r11, rax
+ mov r12, rdx
+ ; A[0] * A[2]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+16]
+ xor r13, r13
+ add r12, rax
+ adc r13, rdx
+ ; A[0] * A[3]
+ mov rax, QWORD PTR [r9]
+ mul QWORD PTR [r9+24]
+ xor r14, r14
+ add r13, rax
+ adc r14, rdx
+ ; A[1] * A[2]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r9+16]
+ xor r15, r15
+ add r13, rax
+ adc r14, rdx
+ adc r15, 0
+ ; A[1] * A[3]
+ mov rax, QWORD PTR [r9+8]
+ mul QWORD PTR [r9+24]
+ add r14, rax
+ adc r15, rdx
+ ; A[2] * A[3]
+ mov rax, QWORD PTR [r9+16]
+ mul QWORD PTR [r9+24]
+ xor rdi, rdi
+ add r15, rax
+ adc rdi, rdx
+ ; Double
+ xor rsi, rsi
+ add r11, r11
+ adc r12, r12
+ adc r13, r13
+ adc r14, r14
+ adc r15, r15
+ adc rdi, rdi
+ adc rsi, 0
+ ; A[0] * A[0]
+ mov rax, QWORD PTR [r9]
+ mul rax
+ mov r10, rax
+ mov rbx, rdx
+ ; A[1] * A[1]
+ mov rax, QWORD PTR [r9+8]
+ mul rax
+ add r11, rbx
+ adc r12, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[2] * A[2]
+ mov rax, QWORD PTR [r9+16]
+ mul rax
+ add r13, rbx
+ adc r14, rax
+ adc rdx, 0
+ mov rbx, rdx
+ ; A[3] * A[3]
+ mov rax, QWORD PTR [r9+24]
+ mul rax
+ add rdi, rax
+ adc rsi, rdx
+ add r15, rbx
+ adc rdi, 0
+ adc rsi, 0
+ ; Reduce: the high limbs (r14, r15, rdi, rsi) are folded into the low
+ ; limbs (r10, r11, r12, r13) with the factor 38; bits at position 255 and
+ ; above are folded with the factor 19 (carried in rbx).
+ mov rax, 38
+ mul rsi
+ add r13, rax
+ adc rdx, 0
+ mov rbx, 9223372036854775807
+ shld rdx, r13, 1
+ imul rdx, rdx, 19
+ and r13, rbx
+ mov rbx, rdx
+ mov rax, 38
+ mul r14
+ xor r14, r14
+ add r10, rax
+ mov rax, 38
+ adc r14, rdx
+ mul r15
+ xor r15, r15
+ add r11, rax
+ mov rax, 38
+ adc r15, rdx
+ mul rdi
+ xor rdi, rdi
+ add r12, rax
+ adc rdi, rdx
+ add r10, rbx
+ adc r11, r14
+ adc r12, r15
+ adc r13, rdi
+ ; Store
+ mov QWORD PTR [rcx], r10
+ mov QWORD PTR [rcx+8], r11
+ mov QWORD PTR [rcx+16], r12
+ mov QWORD PTR [rcx+24], r13
+ ; Next pass: decrement the 8-bit counter and loop while it is non-zero.
+ dec r8b
+ jnz L_fe_sq_n_x64
+ pop rbx
+ pop rsi
+ pop rdi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_sq_n_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ ; fe_mul121666_x64: multiply a 4-limb (4 x 64-bit) value by the constant
+ ; 121666 and store a 4-limb result.
+ ;   rcx - destination; four QWORDs are stored at [rcx]..[rcx+24]
+ ;   rdx - source operand; copied to r8 because mul overwrites rdx
+ ; Bits at position 255 and above of the product (collected in r14 by the
+ ; shld) are multiplied by 19 and added back into the low limbs.
+ ; r12, r13 and r14 are pushed on entry.
+fe_mul121666_x64 PROC
+ push r12
+ push r13
+ push r14
+ mov r8, rdx
+ ; Multiply by 121666
+ mov rax, 121666
+ mul QWORD PTR [r8]
+ xor r12, r12
+ mov r10, rax
+ mov r11, rdx
+ mov rax, 121666
+ mul QWORD PTR [r8+8]
+ xor r13, r13
+ add r11, rax
+ adc r12, rdx
+ mov rax, 121666
+ mul QWORD PTR [r8+16]
+ xor r14, r14
+ add r12, rax
+ adc r13, rdx
+ mov rax, 121666
+ mul QWORD PTR [r8+24]
+ mov r9, 9223372036854775807
+ add r13, rax
+ adc r14, rdx
+ ; r14 = (r14 << 1) | top bit of r13; r13 then keeps only bits 0..62.
+ shld r14, r13, 1
+ and r13, r9
+ mov rax, 19
+ mul r14
+ add r10, rax
+ adc r11, 0
+ adc r12, 0
+ adc r13, 0
+ mov QWORD PTR [rcx], r10
+ mov QWORD PTR
[rcx+8], r11
+ mov QWORD PTR [rcx+16], r12
+ mov QWORD PTR [rcx+24], r13
+ pop r14
+ pop r13
+ pop r12
+ ret
+fe_mul121666_x64 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+ ; fe_invert_x64: raise the input to the power 2^255 - 21 with a fixed
+ ; chain of fe_sq_x64 / fe_sq_n_x64 / fe_mul_x64 calls. 2^255 - 21 equals
+ ; p - 2 for p = 2^255 - 19, which is the modular inverse the "Invert"
+ ; comment refers to.
+ ;   rcx - destination (4 QWORDs)
+ ;   rdx - source operand, called "a" in the step comments
+ ; Stack frame (144 bytes):
+ ;   [rsp]      t0 (32 bytes)
+ ;   [rsp+32]   t1 (32 bytes)
+ ;   [rsp+64]   t2 (32 bytes)
+ ;   [rsp+96]   t3 (32 bytes)
+ ;   [rsp+128]  saved rcx (destination pointer)
+ ;   [rsp+136]  saved rdx (source pointer)
+ ; Helper arguments as set up below: rcx = result, rdx = first operand,
+ ; r8 = second operand (fe_mul_x64) or pass count (fe_sq_n_x64).
+ ; Each fe_sq_n_x64 call passes the same buffer as result and operand.
+ ; Exponents in the step comments are exponents of a.
+fe_invert_x64 PROC
+ sub rsp, 144
+ ; Invert
+ mov QWORD PTR [rsp+128], rcx
+ mov QWORD PTR [rsp+136], rdx
+ ; t0 = a^2
+ mov rcx, rsp
+ mov rdx, QWORD PTR [rsp+136]
+ call fe_sq_x64
+ ; t1 = t0^2 = a^4
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, rsp
+ call fe_sq_x64
+ ; t1 = t1^2 = a^8
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64
+ ; t1 = a * t1 = a^9
+ lea rcx, QWORD PTR [rsp+32]
+ mov rdx, QWORD PTR [rsp+136]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64
+ ; t0 = t0 * t1 = a^11
+ mov rcx, rsp
+ mov rdx, rsp
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64
+ ; t2 = t0^2 = a^22
+ lea rcx, QWORD PTR [rsp+64]
+ mov rdx, rsp
+ call fe_sq_x64
+ ; t1 = t1 * t2 = a^31 = a^(2^5 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ ; t2 = t1^(2^5) (1 + 4 squarings) = a^(2^10 - 2^5)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 4
+ call fe_sq_n_x64
+ ; t1 = t2 * t1 = a^(2^10 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64
+ ; t2 = t1^(2^10) (1 + 9 squarings) = a^(2^20 - 2^10)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 9
+ call fe_sq_n_x64
+ ; t2 = t2 * t1 = a^(2^20 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64
+ ; t3 = t2^(2^20) (1 + 19 squarings) = a^(2^40 - 2^20)
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 19
+ call fe_sq_n_x64
+ ; t2 = t3 * t2 = a^(2^40 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ ; t2 = t2^(2^10) (1 + 9 squarings) = a^(2^50 - 2^10)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 9
+ call fe_sq_n_x64
+ ; t1 = t2 * t1 = a^(2^50 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64
+ ; t2 = t1^(2^50) (1 + 49 squarings) = a^(2^100 - 2^50)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 49
+ call fe_sq_n_x64
+ ; t2 = t2 * t1 = a^(2^100 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64
+ ; t3 = t2^(2^100) (1 + 99 squarings) = a^(2^200 - 2^100)
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+96]
+ lea rdx, QWORD PTR [rsp+96]
+ mov r8, 99
+ call fe_sq_n_x64
+ ; t2 = t3 * t2 = a^(2^200 - 1)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+96]
+ lea r8, QWORD PTR [rsp+64]
+ call fe_mul_x64
+ ; t2 = t2^(2^50) (1 + 49 squarings) = a^(2^250 - 2^50)
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+64]
+ lea rdx, QWORD PTR [rsp+64]
+ mov r8, 49
+ call fe_sq_n_x64
+ ; t1 = t2 * t1 = a^(2^250 - 1)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+64]
+ lea r8, QWORD PTR [rsp+32]
+ call fe_mul_x64
+ ; t1 = t1^(2^5) (1 + 4 squarings) = a^(2^255 - 2^5)
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ call fe_sq_x64
+ lea rcx, QWORD PTR [rsp+32]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, 4
+ call fe_sq_n_x64
+ ; result = t1 * t0 = a^(2^255 - 32 + 11) = a^(2^255 - 21)
+ mov rcx, QWORD PTR [rsp+128]
+ lea rdx, QWORD PTR [rsp+32]
+ mov r8, rsp
+ call fe_mul_x64
+ ; Reload the original argument pointers into rdx/rcx and release the frame.
+ mov rdx, QWORD PTR [rsp+136]
+ mov rcx, QWORD PTR [rsp+128]
+ add rsp, 144
+ ret
+fe_invert_x64 ENDP
+_TEXT ENDS
+IFDEF WOLFSSL_CURVE25519_NOT_USE_ED25519
+_DATA SEGMENT
+ALIGN 16
+L_curve25519_base_x64_x2 QWORD 5cae469cdd684efbh, 8f3f5ced1e350b5ch
+ QWORD 0d9750c687d157114h, 20d342d51873f1b7h
+ptr_L_curve25519_base_x64_x2 QWORD L_curve25519_base_x64_x2
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+curve25519_base_x64 PROC
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdi
+ push rsi
+ push rbx
+ push rbp
+ mov r8, rcx
+ mov r9, rdx
+ sub rsp, 168
+ xor rsi, rsi
+ mov QWORD PTR [rsp+160], r8
+ ; Set base point x
+ mov QWORD PTR [r8], 9
+ mov QWORD PTR [r8+8], 0
+ mov QWORD PTR [r8+16], 0
+ mov QWORD PTR [r8+24], 0
+ ; Set one
+ mov QWORD PTR [rsp], 1
+ mov QWORD PTR [rsp+8], 0
+ mov QWORD PTR [rsp+16], 0
+ mov QWORD PTR [rsp+24], 0
+ ; NOTE(review): ptr_L_curve25519_base_x64_x2 is declared above as a single
+ ; QWORD holding the address of L_curve25519_base_x64_x2, so the loads below
+ ; appear to read that pointer value and the bytes following it rather than
+ ; the four table QWORDs - confirm whether L_curve25519_base_x64_x2 itself
+ ; was intended as the base of these loads.
+ mov rcx, QWORD PTR [ptr_L_curve25519_base_x64_x2]
+ mov r10, QWORD PTR [ptr_L_curve25519_base_x64_x2+8]
+ mov r11, QWORD PTR
[ptr_L_curve25519_base_x64_x2+16] + mov r12, QWORD PTR [ptr_L_curve25519_base_x64_x2+24] + ; Set one + mov QWORD PTR [rsp+32], 1 + mov QWORD PTR [rsp+40], 0 + mov QWORD PTR [rsp+48], 0 + mov QWORD PTR [rsp+56], 0 + mov QWORD PTR [rsp+64], rcx + mov QWORD PTR [rsp+72], r10 + mov QWORD PTR [rsp+80], r11 + mov QWORD PTR [rsp+88], r12 + mov rbp, 253 +L_curve25519_base_x64_bits: + mov r10, rbp + mov rcx, rbp + and rcx, 63 + shr r10, 6 + mov rbx, QWORD PTR [r9+8*r10] + shr rbx, cl + and rbx, 1 + xor rsi, rbx + neg rsi + ; Conditional Swap + mov rcx, QWORD PTR [r8] + mov r10, QWORD PTR [r8+8] + mov r11, QWORD PTR [r8+16] + mov r12, QWORD PTR [r8+24] + mov r13, QWORD PTR [rsp] + mov r14, QWORD PTR [rsp+8] + mov r15, QWORD PTR [rsp+16] + mov rdi, QWORD PTR [rsp+24] + xor rcx, QWORD PTR [rsp+64] + xor r10, QWORD PTR [rsp+72] + xor r11, QWORD PTR [rsp+80] + xor r12, QWORD PTR [rsp+88] + xor r13, QWORD PTR [rsp+32] + xor r14, QWORD PTR [rsp+40] + xor r15, QWORD PTR [rsp+48] + xor rdi, QWORD PTR [rsp+56] + and rcx, rsi + and r10, rsi + and r11, rsi + and r12, rsi + and r13, rsi + and r14, rsi + and r15, rsi + and rdi, rsi + xor QWORD PTR [r8], rcx + xor QWORD PTR [r8+8], r10 + xor QWORD PTR [r8+16], r11 + xor QWORD PTR [r8+24], r12 + xor QWORD PTR [rsp], r13 + xor QWORD PTR [rsp+8], r14 + xor QWORD PTR [rsp+16], r15 + xor QWORD PTR [rsp+24], rdi + xor QWORD PTR [rsp+64], rcx + xor QWORD PTR [rsp+72], r10 + xor QWORD PTR [rsp+80], r11 + xor QWORD PTR [rsp+88], r12 + xor QWORD PTR [rsp+32], r13 + xor QWORD PTR [rsp+40], r14 + xor QWORD PTR [rsp+48], r15 + xor QWORD PTR [rsp+56], rdi + mov rsi, rbx + ; Add-Sub + ; Add + mov rcx, QWORD PTR [r8] + mov r10, QWORD PTR [r8+8] + mov r11, QWORD PTR [r8+16] + mov r12, QWORD PTR [r8+24] + mov r13, rcx + add rcx, QWORD PTR [rsp] + mov r14, r10 + adc r10, QWORD PTR [rsp+8] + mov r15, r11 + adc r11, QWORD PTR [rsp+16] + mov rdi, r12 + adc r12, QWORD PTR [rsp+24] + mov rbx, 0 + adc rbx, 0 + shld rbx, r12, 1 + imul rbx, 19 + btr r12, 63 + ; Sub 
modulus (if overflow) + add rcx, rbx + adc r10, 0 + adc r11, 0 + adc r12, 0 + ; Sub + sub r13, QWORD PTR [rsp] + sbb r14, QWORD PTR [rsp+8] + sbb r15, QWORD PTR [rsp+16] + sbb rdi, QWORD PTR [rsp+24] + sbb rbx, rbx + shld rbx, rdi, 1 + imul rbx, -19 + btr rdi, 63 + ; Add modulus (if underflow) + sub r13, rbx + sbb r14, 0 + sbb r15, 0 + sbb rdi, 0 + mov QWORD PTR [r8], rcx + mov QWORD PTR [r8+8], r10 + mov QWORD PTR [r8+16], r11 + mov QWORD PTR [r8+24], r12 + mov QWORD PTR [rsp+128], r13 + mov QWORD PTR [rsp+136], r14 + mov QWORD PTR [rsp+144], r15 + mov QWORD PTR [rsp+152], rdi + ; Add-Sub + ; Add + mov rcx, QWORD PTR [rsp+64] + mov r10, QWORD PTR [rsp+72] + mov r11, QWORD PTR [rsp+80] + mov r12, QWORD PTR [rsp+88] + mov r13, rcx + add rcx, QWORD PTR [rsp+32] + mov r14, r10 + adc r10, QWORD PTR [rsp+40] + mov r15, r11 + adc r11, QWORD PTR [rsp+48] + mov rdi, r12 + adc r12, QWORD PTR [rsp+56] + mov rbx, 0 + adc rbx, 0 + shld rbx, r12, 1 + imul rbx, 19 + btr r12, 63 + ; Sub modulus (if overflow) + add rcx, rbx + adc r10, 0 + adc r11, 0 + adc r12, 0 + ; Sub + sub r13, QWORD PTR [rsp+32] + sbb r14, QWORD PTR [rsp+40] + sbb r15, QWORD PTR [rsp+48] + sbb rdi, QWORD PTR [rsp+56] + sbb rbx, rbx + shld rbx, rdi, 1 + imul rbx, -19 + btr rdi, 63 + ; Add modulus (if underflow) + sub r13, rbx + sbb r14, 0 + sbb r15, 0 + sbb rdi, 0 + mov QWORD PTR [rsp+32], rcx + mov QWORD PTR [rsp+40], r10 + mov QWORD PTR [rsp+48], r11 + mov QWORD PTR [rsp+56], r12 + mov QWORD PTR [rsp+96], r13 + mov QWORD PTR [rsp+104], r14 + mov QWORD PTR [rsp+112], r15 + mov QWORD PTR [rsp+120], rdi + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+32] + mov rcx, rax + mov r10, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+32] + xor r11, r11 + add r10, rax + adc r11, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+40] + xor r12, r12 + add r10, rax + adc r11, rdx + adc r12, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rsp+144] + mul 
QWORD PTR [rsp+32] + add r11, rax + adc r12, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+40] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+48] + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rsp+152] + mul QWORD PTR [rsp+32] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rsp+144] + mul QWORD PTR [rsp+40] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+48] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+56] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [rsp+152] + mul QWORD PTR [rsp+40] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rsp+144] + mul QWORD PTR [rsp+48] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+56] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rsp+152] + mul QWORD PTR [rsp+48] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rsp+144] + mul QWORD PTR [rsp+56] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rsp+152] + mul QWORD PTR [rsp+56] + add r15, rax + adc rdi, rdx + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rbx, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, rdx, 19 + and r12, rbx + mov rbx, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add rcx, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add rcx, rbx + adc r10, r13 + adc r11, r14 + adc r12, r15 + ; Store + mov QWORD PTR [rsp+32], rcx + mov QWORD PTR [rsp+40], r10 + mov QWORD PTR [rsp+48], r11 + 
mov QWORD PTR [rsp+56], r12 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r8] + mul QWORD PTR [rsp+96] + mov rcx, rax + mov r10, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [rsp+96] + xor r11, r11 + add r10, rax + adc r11, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r8] + mul QWORD PTR [rsp+104] + xor r12, r12 + add r10, rax + adc r11, rdx + adc r12, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r8+16] + mul QWORD PTR [rsp+96] + add r11, rax + adc r12, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [rsp+104] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r8] + mul QWORD PTR [rsp+112] + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r8+24] + mul QWORD PTR [rsp+96] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r8+16] + mul QWORD PTR [rsp+104] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [rsp+112] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r8] + mul QWORD PTR [rsp+120] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r8+24] + mul QWORD PTR [rsp+104] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r8+16] + mul QWORD PTR [rsp+112] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [rsp+120] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r8+24] + mul QWORD PTR [rsp+112] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r8+16] + mul QWORD PTR [rsp+120] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r8+24] + mul QWORD PTR [rsp+120] + add r15, rax + adc rdi, rdx + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rbx, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, 
rdx, 19 + and r12, rbx + mov rbx, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add rcx, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add rcx, rbx + adc r10, r13 + adc r11, r14 + adc r12, r15 + ; Store + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], r10 + mov QWORD PTR [rsp+16], r11 + mov QWORD PTR [rsp+24], r12 + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+136] + mov r10, rax + mov r11, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+144] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+152] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+144] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+152] + add r13, rax + adc r14, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [rsp+144] + mul QWORD PTR [rsp+152] + xor r15, r15 + add r14, rax + adc r15, rdx + ; Double + xor rdi, rdi + add r10, r10 + adc r11, r11 + adc r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [rsp+128] + mul rax + mov rcx, rax + mov rbx, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [rsp+136] + mul rax + add r10, rbx + adc r11, rax + adc rdx, 0 + mov rbx, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [rsp+144] + mul rax + add r12, rbx + adc r13, rax + adc rdx, 0 + mov rbx, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [rsp+152] + mul rax + add r15, rax + adc rdi, rdx + add r14, rbx + adc r15, 0 + adc rdi, 0 + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rbx, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, rdx, 19 + and r12, rbx + mov rbx, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add rcx, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + 
adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add rcx, rbx + adc r10, r13 + adc r11, r14 + adc r12, r15 + ; Store + mov QWORD PTR [rsp+96], rcx + mov QWORD PTR [rsp+104], r10 + mov QWORD PTR [rsp+112], r11 + mov QWORD PTR [rsp+120], r12 + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+8] + mov r10, rax + mov r11, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+16] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+24] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [r8+16] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [r8+24] + add r13, rax + adc r14, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [r8+16] + mul QWORD PTR [r8+24] + xor r15, r15 + add r14, rax + adc r15, rdx + ; Double + xor rdi, rdi + add r10, r10 + adc r11, r11 + adc r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [r8] + mul rax + mov rcx, rax + mov rbx, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [r8+8] + mul rax + add r10, rbx + adc r11, rax + adc rdx, 0 + mov rbx, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [r8+16] + mul rax + add r12, rbx + adc r13, rax + adc rdx, 0 + mov rbx, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [r8+24] + mul rax + add r15, rax + adc rdi, rdx + add r14, rbx + adc r15, 0 + adc rdi, 0 + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rbx, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, rdx, 19 + and r12, rbx + mov rbx, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add rcx, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add rcx, rbx + adc r10, r13 + adc r11, r14 + adc r12, r15 + ; Store + mov QWORD PTR [rsp+128], rcx + mov QWORD PTR [rsp+136], r10 + mov QWORD PTR 
[rsp+144], r11 + mov QWORD PTR [rsp+152], r12 + ; Add-Sub + ; Add + mov rcx, QWORD PTR [rsp] + mov r10, QWORD PTR [rsp+8] + mov r11, QWORD PTR [rsp+16] + mov r12, QWORD PTR [rsp+24] + mov r13, rcx + add rcx, QWORD PTR [rsp+32] + mov r14, r10 + adc r10, QWORD PTR [rsp+40] + mov r15, r11 + adc r11, QWORD PTR [rsp+48] + mov rdi, r12 + adc r12, QWORD PTR [rsp+56] + mov rbx, 0 + adc rbx, 0 + shld rbx, r12, 1 + imul rbx, 19 + btr r12, 63 + ; Sub modulus (if overflow) + add rcx, rbx + adc r10, 0 + adc r11, 0 + adc r12, 0 + ; Sub + sub r13, QWORD PTR [rsp+32] + sbb r14, QWORD PTR [rsp+40] + sbb r15, QWORD PTR [rsp+48] + sbb rdi, QWORD PTR [rsp+56] + sbb rbx, rbx + shld rbx, rdi, 1 + imul rbx, -19 + btr rdi, 63 + ; Add modulus (if underflow) + sub r13, rbx + sbb r14, 0 + sbb r15, 0 + sbb rdi, 0 + mov QWORD PTR [rsp+64], rcx + mov QWORD PTR [rsp+72], r10 + mov QWORD PTR [rsp+80], r11 + mov QWORD PTR [rsp+88], r12 + mov QWORD PTR [rsp+32], r13 + mov QWORD PTR [rsp+40], r14 + mov QWORD PTR [rsp+48], r15 + mov QWORD PTR [rsp+56], rdi + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+128] + mov rcx, rax + mov r10, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+128] + xor r11, r11 + add r10, rax + adc r11, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+136] + xor r12, r12 + add r10, rax + adc r11, rdx + adc r12, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+128] + add r11, rax + adc r12, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+136] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+144] + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+128] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+136] + add r12, rax + adc r13, rdx + adc r14, 0 + ; 
A[2] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+144] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+152] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+136] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+144] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+152] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+144] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+152] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+152] + add r15, rax + adc rdi, rdx + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rbx, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, rdx, 19 + and r12, rbx + mov rbx, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add rcx, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add rcx, rbx + adc r10, r13 + adc r11, r14 + adc r12, r15 + ; Store + mov QWORD PTR [r8], rcx + mov QWORD PTR [r8+8], r10 + mov QWORD PTR [r8+16], r11 + mov QWORD PTR [r8+24], r12 + ; Sub + mov rcx, QWORD PTR [rsp+128] + mov r10, QWORD PTR [rsp+136] + mov r11, QWORD PTR [rsp+144] + mov r12, QWORD PTR [rsp+152] + sub rcx, QWORD PTR [rsp+96] + sbb r10, QWORD PTR [rsp+104] + sbb r11, QWORD PTR [rsp+112] + sbb r12, QWORD PTR [rsp+120] + sbb rbx, rbx + shld rbx, r12, 1 + imul rbx, -19 + btr r12, 63 + ; Add modulus (if underflow) + sub rcx, rbx + sbb r10, 0 + sbb r11, 0 + sbb r12, 0 + mov QWORD PTR [rsp+128], rcx + mov QWORD PTR [rsp+136], r10 + mov QWORD PTR [rsp+144], r11 + mov QWORD PTR 
[rsp+152], r12 + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [rsp+32] + mul QWORD PTR [rsp+40] + mov r10, rax + mov r11, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [rsp+32] + mul QWORD PTR [rsp+48] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [rsp+32] + mul QWORD PTR [rsp+56] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [rsp+40] + mul QWORD PTR [rsp+48] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [rsp+40] + mul QWORD PTR [rsp+56] + add r13, rax + adc r14, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [rsp+48] + mul QWORD PTR [rsp+56] + xor r15, r15 + add r14, rax + adc r15, rdx + ; Double + xor rdi, rdi + add r10, r10 + adc r11, r11 + adc r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [rsp+32] + mul rax + mov rcx, rax + mov rbx, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [rsp+40] + mul rax + add r10, rbx + adc r11, rax + adc rdx, 0 + mov rbx, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [rsp+48] + mul rax + add r12, rbx + adc r13, rax + adc rdx, 0 + mov rbx, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [rsp+56] + mul rax + add r15, rax + adc rdi, rdx + add r14, rbx + adc r15, 0 + adc rdi, 0 + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rbx, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, rdx, 19 + and r12, rbx + mov rbx, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add rcx, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add rcx, rbx + adc r10, r13 + adc r11, r14 + adc r12, r15 + ; Store + mov QWORD PTR [rsp+32], rcx + mov QWORD PTR [rsp+40], r10 + mov QWORD PTR [rsp+48], r11 + mov QWORD PTR [rsp+56], r12 + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [rsp+64] + mul QWORD PTR [rsp+72] + mov r10, rax + mov r11, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [rsp+64] + mul QWORD PTR 
[rsp+80] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [rsp+64] + mul QWORD PTR [rsp+88] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [rsp+72] + mul QWORD PTR [rsp+80] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [rsp+72] + mul QWORD PTR [rsp+88] + add r13, rax + adc r14, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [rsp+80] + mul QWORD PTR [rsp+88] + xor r15, r15 + add r14, rax + adc r15, rdx + ; Double + xor rdi, rdi + add r10, r10 + adc r11, r11 + adc r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [rsp+64] + mul rax + mov rcx, rax + mov rbx, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [rsp+72] + mul rax + add r10, rbx + adc r11, rax + adc rdx, 0 + mov rbx, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [rsp+80] + mul rax + add r12, rbx + adc r13, rax + adc rdx, 0 + mov rbx, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [rsp+88] + mul rax + add r15, rax + adc rdi, rdx + add r14, rbx + adc r15, 0 + adc rdi, 0 + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rbx, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, rdx, 19 + and r12, rbx + mov rbx, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add rcx, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add rcx, rbx + adc r10, r13 + adc r11, r14 + adc r12, r15 + ; Store + mov QWORD PTR [rsp+64], rcx + mov QWORD PTR [rsp+72], r10 + mov QWORD PTR [rsp+80], r11 + mov QWORD PTR [rsp+88], r12 + ; Multiply by 121666 + mov rax, 121666 + mul QWORD PTR [rsp+128] + xor r11, r11 + mov rcx, rax + mov r10, rdx + mov rax, 121666 + mul QWORD PTR [rsp+136] + xor r12, r12 + add r10, rax + adc r11, rdx + mov rax, 121666 + mul QWORD PTR [rsp+144] + xor r14, r14 + add r11, rax + adc r12, rdx + mov rax, 121666 + mul QWORD PTR [rsp+152] + mov r13, 9223372036854775807 
+ add r12, rax + adc r14, rdx + add rcx, QWORD PTR [rsp+96] + adc r10, QWORD PTR [rsp+104] + adc r11, QWORD PTR [rsp+112] + adc r12, QWORD PTR [rsp+120] + adc r14, 0 + shld r14, r12, 1 + and r12, r13 + mov rax, 19 + mul r14 + add rcx, rax + adc r10, 0 + adc r11, 0 + adc r12, 0 + mov QWORD PTR [rsp+96], rcx + mov QWORD PTR [rsp+104], r10 + mov QWORD PTR [rsp+112], r11 + mov QWORD PTR [rsp+120], r12 + ; Multiply by 9 + mov rax, 9 + mul QWORD PTR [rsp+32] + xor r11, r11 + mov rcx, rax + mov r10, rdx + mov rax, 9 + mul QWORD PTR [rsp+40] + xor r12, r12 + add r10, rax + adc r11, rdx + mov rax, 9 + mul QWORD PTR [rsp+48] + xor r14, r14 + add r11, rax + adc r12, rdx + mov rax, 9 + mul QWORD PTR [rsp+56] + mov r13, 9223372036854775807 + add r12, rax + adc r14, rdx + shld r14, r12, 1 + and r12, r13 + mov rax, 19 + mul r14 + add rcx, rax + adc r10, 0 + adc r11, 0 + adc r12, 0 + mov QWORD PTR [rsp+32], rcx + mov QWORD PTR [rsp+40], r10 + mov QWORD PTR [rsp+48], r11 + mov QWORD PTR [rsp+56], r12 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+128] + mov rcx, rax + mov r10, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+128] + xor r11, r11 + add r10, rax + adc r11, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+136] + xor r12, r12 + add r10, rax + adc r11, rdx + adc r12, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+128] + add r11, rax + adc r12, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+136] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+144] + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+128] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+136] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[1] + mov rax, QWORD PTR 
[rsp+104] + mul QWORD PTR [rsp+144] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+152] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+136] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+144] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+152] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+144] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+152] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+152] + add r15, rax + adc rdi, rdx + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rbx, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, rdx, 19 + and r12, rbx + mov rbx, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add rcx, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add rcx, rbx + adc r10, r13 + adc r11, r14 + adc r12, r15 + ; Store + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], r10 + mov QWORD PTR [rsp+16], r11 + mov QWORD PTR [rsp+24], r12 + dec rbp + cmp rbp, 3 + jge L_curve25519_base_x64_bits + neg rsi + ; Conditional Swap + mov rcx, QWORD PTR [r8] + mov r10, QWORD PTR [r8+8] + mov r11, QWORD PTR [r8+16] + mov r12, QWORD PTR [r8+24] + mov r13, QWORD PTR [rsp] + mov r14, QWORD PTR [rsp+8] + mov r15, QWORD PTR [rsp+16] + mov rdi, QWORD PTR [rsp+24] + xor rcx, QWORD PTR [rsp+64] + xor r10, QWORD PTR [rsp+72] + xor r11, QWORD PTR [rsp+80] + xor r12, QWORD PTR [rsp+88] + xor r13, QWORD PTR [rsp+32] + xor r14, QWORD PTR [rsp+40] + xor r15, QWORD PTR [rsp+48] + xor rdi, QWORD PTR 
[rsp+56] + and rcx, rsi + and r10, rsi + and r11, rsi + and r12, rsi + and r13, rsi + and r14, rsi + and r15, rsi + and rdi, rsi + xor QWORD PTR [r8], rcx + xor QWORD PTR [r8+8], r10 + xor QWORD PTR [r8+16], r11 + xor QWORD PTR [r8+24], r12 + xor QWORD PTR [rsp], r13 + xor QWORD PTR [rsp+8], r14 + xor QWORD PTR [rsp+16], r15 + xor QWORD PTR [rsp+24], rdi + xor QWORD PTR [rsp+64], rcx + xor QWORD PTR [rsp+72], r10 + xor QWORD PTR [rsp+80], r11 + xor QWORD PTR [rsp+88], r12 + xor QWORD PTR [rsp+32], r13 + xor QWORD PTR [rsp+40], r14 + xor QWORD PTR [rsp+48], r15 + xor QWORD PTR [rsp+56], rdi +L_curve25519_base_x64_3: + ; Add-Sub + ; Add + mov rcx, QWORD PTR [r8] + mov r10, QWORD PTR [r8+8] + mov r11, QWORD PTR [r8+16] + mov r12, QWORD PTR [r8+24] + mov r13, rcx + add rcx, QWORD PTR [rsp] + mov r14, r10 + adc r10, QWORD PTR [rsp+8] + mov r15, r11 + adc r11, QWORD PTR [rsp+16] + mov rdi, r12 + adc r12, QWORD PTR [rsp+24] + mov rbx, 0 + adc rbx, 0 + shld rbx, r12, 1 + imul rbx, 19 + btr r12, 63 + ; Sub modulus (if overflow) + add rcx, rbx + adc r10, 0 + adc r11, 0 + adc r12, 0 + ; Sub + sub r13, QWORD PTR [rsp] + sbb r14, QWORD PTR [rsp+8] + sbb r15, QWORD PTR [rsp+16] + sbb rdi, QWORD PTR [rsp+24] + sbb rbx, rbx + shld rbx, rdi, 1 + imul rbx, -19 + btr rdi, 63 + ; Add modulus (if underflow) + sub r13, rbx + sbb r14, 0 + sbb r15, 0 + sbb rdi, 0 + mov QWORD PTR [r8], rcx + mov QWORD PTR [r8+8], r10 + mov QWORD PTR [r8+16], r11 + mov QWORD PTR [r8+24], r12 + mov QWORD PTR [rsp+128], r13 + mov QWORD PTR [rsp+136], r14 + mov QWORD PTR [rsp+144], r15 + mov QWORD PTR [rsp+152], rdi + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+136] + mov r10, rax + mov r11, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+144] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+152] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [rsp+136] + 
mul QWORD PTR [rsp+144] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+152] + add r13, rax + adc r14, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [rsp+144] + mul QWORD PTR [rsp+152] + xor r15, r15 + add r14, rax + adc r15, rdx + ; Double + xor rdi, rdi + add r10, r10 + adc r11, r11 + adc r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [rsp+128] + mul rax + mov rcx, rax + mov rbx, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [rsp+136] + mul rax + add r10, rbx + adc r11, rax + adc rdx, 0 + mov rbx, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [rsp+144] + mul rax + add r12, rbx + adc r13, rax + adc rdx, 0 + mov rbx, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [rsp+152] + mul rax + add r15, rax + adc rdi, rdx + add r14, rbx + adc r15, 0 + adc rdi, 0 + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rbx, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, rdx, 19 + and r12, rbx + mov rbx, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add rcx, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add rcx, rbx + adc r10, r13 + adc r11, r14 + adc r12, r15 + ; Store + mov QWORD PTR [rsp+96], rcx + mov QWORD PTR [rsp+104], r10 + mov QWORD PTR [rsp+112], r11 + mov QWORD PTR [rsp+120], r12 + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+8] + mov r10, rax + mov r11, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+16] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+24] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [r8+16] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [r8+24] + add r13, rax + adc r14, rdx + ; A[2] * A[3] + 
mov rax, QWORD PTR [r8+16] + mul QWORD PTR [r8+24] + xor r15, r15 + add r14, rax + adc r15, rdx + ; Double + xor rdi, rdi + add r10, r10 + adc r11, r11 + adc r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [r8] + mul rax + mov rcx, rax + mov rbx, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [r8+8] + mul rax + add r10, rbx + adc r11, rax + adc rdx, 0 + mov rbx, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [r8+16] + mul rax + add r12, rbx + adc r13, rax + adc rdx, 0 + mov rbx, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [r8+24] + mul rax + add r15, rax + adc rdi, rdx + add r14, rbx + adc r15, 0 + adc rdi, 0 + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rbx, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, rdx, 19 + and r12, rbx + mov rbx, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add rcx, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add rcx, rbx + adc r10, r13 + adc r11, r14 + adc r12, r15 + ; Store + mov QWORD PTR [rsp+128], rcx + mov QWORD PTR [rsp+136], r10 + mov QWORD PTR [rsp+144], r11 + mov QWORD PTR [rsp+152], r12 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+128] + mov rcx, rax + mov r10, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+128] + xor r11, r11 + add r10, rax + adc r11, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+136] + xor r12, r12 + add r10, rax + adc r11, rdx + adc r12, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+128] + add r11, rax + adc r12, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+136] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+144] + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+128] + 
xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+136] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+144] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+152] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+136] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+144] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+152] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+144] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+152] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+152] + add r15, rax + adc rdi, rdx + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rbx, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, rdx, 19 + and r12, rbx + mov rbx, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add rcx, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add rcx, rbx + adc r10, r13 + adc r11, r14 + adc r12, r15 + ; Store + mov QWORD PTR [r8], rcx + mov QWORD PTR [r8+8], r10 + mov QWORD PTR [r8+16], r11 + mov QWORD PTR [r8+24], r12 + ; Sub + mov rcx, QWORD PTR [rsp+128] + mov r10, QWORD PTR [rsp+136] + mov r11, QWORD PTR [rsp+144] + mov r12, QWORD PTR [rsp+152] + sub rcx, QWORD PTR [rsp+96] + sbb r10, QWORD PTR [rsp+104] + sbb r11, QWORD PTR [rsp+112] + sbb r12, QWORD PTR [rsp+120] + sbb rbx, rbx + shld rbx, r12, 1 + imul rbx, -19 + btr r12, 63 + ; Add modulus (if 
underflow) + sub rcx, rbx + sbb r10, 0 + sbb r11, 0 + sbb r12, 0 + mov QWORD PTR [rsp+128], rcx + mov QWORD PTR [rsp+136], r10 + mov QWORD PTR [rsp+144], r11 + mov QWORD PTR [rsp+152], r12 + ; Multiply by 121666 + mov rax, 121666 + mul QWORD PTR [rsp+128] + xor r11, r11 + mov rcx, rax + mov r10, rdx + mov rax, 121666 + mul QWORD PTR [rsp+136] + xor r12, r12 + add r10, rax + adc r11, rdx + mov rax, 121666 + mul QWORD PTR [rsp+144] + xor r14, r14 + add r11, rax + adc r12, rdx + mov rax, 121666 + mul QWORD PTR [rsp+152] + mov r13, 9223372036854775807 + add r12, rax + adc r14, rdx + add rcx, QWORD PTR [rsp+96] + adc r10, QWORD PTR [rsp+104] + adc r11, QWORD PTR [rsp+112] + adc r12, QWORD PTR [rsp+120] + adc r14, 0 + shld r14, r12, 1 + and r12, r13 + mov rax, 19 + mul r14 + add rcx, rax + adc r10, 0 + adc r11, 0 + adc r12, 0 + mov QWORD PTR [rsp+96], rcx + mov QWORD PTR [rsp+104], r10 + mov QWORD PTR [rsp+112], r11 + mov QWORD PTR [rsp+120], r12 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+128] + mov rcx, rax + mov r10, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+128] + xor r11, r11 + add r10, rax + adc r11, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+136] + xor r12, r12 + add r10, rax + adc r11, rdx + adc r12, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+128] + add r11, rax + adc r12, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+136] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+144] + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+128] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+136] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+144] + add r12, 
rax + adc r13, rdx + adc r14, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+152] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+136] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+144] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+152] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+144] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+152] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+152] + add r15, rax + adc rdi, rdx + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rbx, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, rdx, 19 + and r12, rbx + mov rbx, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add rcx, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add rcx, rbx + adc r10, r13 + adc r11, r14 + adc r12, r15 + ; Store + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], r10 + mov QWORD PTR [rsp+16], r11 + mov QWORD PTR [rsp+24], r12 + dec rbp + jge L_curve25519_base_x64_3 + ; Invert + lea rcx, QWORD PTR [rsp+32] + mov rdx, rsp + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+64] + mov rdx, rsp + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR 
[rsp+64] + lea r8, QWORD PTR [rsp+96] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 4 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 9 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+128] + mov r8, 19 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+128] + lea r8, QWORD PTR [rsp+96] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 9 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 49 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+128] + mov r8, 99 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+128] + lea r8, QWORD PTR [rsp+96] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 49 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea 
rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + mov r8, 4 + call fe_sq_n_x64 + mov rcx, rsp + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+32] + call fe_mul_x64 + mov r8, QWORD PTR [rsp+160] + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rsp] + mul QWORD PTR [r8] + mov rcx, rax + mov r10, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [rsp+8] + mul QWORD PTR [r8] + xor r11, r11 + add r10, rax + adc r11, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rsp] + mul QWORD PTR [r8+8] + xor r12, r12 + add r10, rax + adc r11, rdx + adc r12, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rsp+16] + mul QWORD PTR [r8] + add r11, rax + adc r12, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rsp+8] + mul QWORD PTR [r8+8] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rsp] + mul QWORD PTR [r8+16] + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rsp+24] + mul QWORD PTR [r8] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rsp+16] + mul QWORD PTR [r8+8] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [rsp+8] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [rsp] + mul QWORD PTR [r8+24] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [rsp+24] + mul QWORD PTR [r8+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rsp+16] + mul QWORD PTR [r8+16] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rsp+8] + mul QWORD PTR [r8+24] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rsp+24] + mul QWORD PTR [r8+16] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rsp+16] + mul QWORD PTR [r8+24] + add r14, rax + adc r15, rdx + 
adc rdi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rsp+24] + mul QWORD PTR [r8+24] + add r15, rax + adc rdi, rdx + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rbx, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, rdx, 19 + and r12, rbx + mov rbx, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add rcx, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add rcx, rbx + adc r10, r13 + adc r11, r14 + adc r12, r15 + mov rbx, 9223372036854775807 + mov rax, r12 + sar rax, 63 + and rax, 19 + and r12, rbx + add rcx, rax + adc r10, 0 + adc r11, 0 + adc r12, 0 + mov rax, 9223372036854775807 + mov rdx, rcx + add rdx, 19 + mov rdx, r10 + adc rdx, 0 + mov rdx, r11 + adc rdx, 0 + mov rdx, r12 + adc rdx, 0 + sar rdx, 63 + and rdx, 19 + and r12, rax + add rcx, rdx + adc r10, 0 + adc r11, 0 + adc r12, 0 + and r12, rax + ; Store + mov QWORD PTR [r8], rcx + mov QWORD PTR [r8+8], r10 + mov QWORD PTR [r8+16], r11 + mov QWORD PTR [r8+24], r12 + xor rax, rax + add rsp, 168 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +curve25519_base_x64 ENDP +_TEXT ENDS +ENDIF +_TEXT SEGMENT READONLY PARA +curve25519_x64 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov r9, rcx + mov r10, rdx + sub rsp, 176 + xor rbx, rbx + mov QWORD PTR [rsp+168], r9 + ; Set one + mov QWORD PTR [r9], 1 + mov QWORD PTR [r9+8], 0 + mov QWORD PTR [r9+16], 0 + mov QWORD PTR [r9+24], 0 + ; Set zero + mov QWORD PTR [rsp], 0 + mov QWORD PTR [rsp+8], 0 + mov QWORD PTR [rsp+16], 0 + mov QWORD PTR [rsp+24], 0 + ; Set one + mov QWORD PTR [rsp+32], 1 + mov QWORD PTR [rsp+40], 0 + mov QWORD PTR [rsp+48], 0 + mov QWORD PTR [rsp+56], 0 + ; Copy + mov rcx, QWORD PTR [r8] + mov r11, QWORD PTR [r8+8] + mov r12, QWORD PTR [r8+16] + mov r13, QWORD PTR [r8+24] + mov QWORD PTR [rsp+64], rcx + mov QWORD PTR [rsp+72], r11 + mov QWORD PTR 
[rsp+80], r12 + mov QWORD PTR [rsp+88], r13 + mov r11, 254 +L_curve25519_x64_bits: + mov QWORD PTR [rsp+160], r11 + mov rcx, r11 + and rcx, 63 + shr r11, 6 + mov rbp, QWORD PTR [r10+8*r11] + shr rbp, cl + and rbp, 1 + xor rbx, rbp + neg rbx + ; Conditional Swap + mov rcx, QWORD PTR [r9] + mov r11, QWORD PTR [r9+8] + mov r12, QWORD PTR [r9+16] + mov r13, QWORD PTR [r9+24] + mov r14, QWORD PTR [rsp] + mov r15, QWORD PTR [rsp+8] + mov rdi, QWORD PTR [rsp+16] + mov rsi, QWORD PTR [rsp+24] + xor rcx, QWORD PTR [rsp+64] + xor r11, QWORD PTR [rsp+72] + xor r12, QWORD PTR [rsp+80] + xor r13, QWORD PTR [rsp+88] + xor r14, QWORD PTR [rsp+32] + xor r15, QWORD PTR [rsp+40] + xor rdi, QWORD PTR [rsp+48] + xor rsi, QWORD PTR [rsp+56] + and rcx, rbx + and r11, rbx + and r12, rbx + and r13, rbx + and r14, rbx + and r15, rbx + and rdi, rbx + and rsi, rbx + xor QWORD PTR [r9], rcx + xor QWORD PTR [r9+8], r11 + xor QWORD PTR [r9+16], r12 + xor QWORD PTR [r9+24], r13 + xor QWORD PTR [rsp], r14 + xor QWORD PTR [rsp+8], r15 + xor QWORD PTR [rsp+16], rdi + xor QWORD PTR [rsp+24], rsi + xor QWORD PTR [rsp+64], rcx + xor QWORD PTR [rsp+72], r11 + xor QWORD PTR [rsp+80], r12 + xor QWORD PTR [rsp+88], r13 + xor QWORD PTR [rsp+32], r14 + xor QWORD PTR [rsp+40], r15 + xor QWORD PTR [rsp+48], rdi + xor QWORD PTR [rsp+56], rsi + mov rbx, rbp + ; Add-Sub + ; Add + mov rcx, QWORD PTR [r9] + mov r11, QWORD PTR [r9+8] + mov r12, QWORD PTR [r9+16] + mov r13, QWORD PTR [r9+24] + mov r14, rcx + add rcx, QWORD PTR [rsp] + mov r15, r11 + adc r11, QWORD PTR [rsp+8] + mov rdi, r12 + adc r12, QWORD PTR [rsp+16] + mov rsi, r13 + adc r13, QWORD PTR [rsp+24] + mov rbp, 0 + adc rbp, 0 + shld rbp, r13, 1 + imul rbp, 19 + btr r13, 63 + ; Sub modulus (if overflow) + add rcx, rbp + adc r11, 0 + adc r12, 0 + adc r13, 0 + ; Sub + sub r14, QWORD PTR [rsp] + sbb r15, QWORD PTR [rsp+8] + sbb rdi, QWORD PTR [rsp+16] + sbb rsi, QWORD PTR [rsp+24] + sbb rbp, rbp + shld rbp, rsi, 1 + imul rbp, -19 + btr rsi, 63 + ; Add 
modulus (if underflow) + sub r14, rbp + sbb r15, 0 + sbb rdi, 0 + sbb rsi, 0 + mov QWORD PTR [r9], rcx + mov QWORD PTR [r9+8], r11 + mov QWORD PTR [r9+16], r12 + mov QWORD PTR [r9+24], r13 + mov QWORD PTR [rsp+128], r14 + mov QWORD PTR [rsp+136], r15 + mov QWORD PTR [rsp+144], rdi + mov QWORD PTR [rsp+152], rsi + ; Add-Sub + ; Add + mov rcx, QWORD PTR [rsp+64] + mov r11, QWORD PTR [rsp+72] + mov r12, QWORD PTR [rsp+80] + mov r13, QWORD PTR [rsp+88] + mov r14, rcx + add rcx, QWORD PTR [rsp+32] + mov r15, r11 + adc r11, QWORD PTR [rsp+40] + mov rdi, r12 + adc r12, QWORD PTR [rsp+48] + mov rsi, r13 + adc r13, QWORD PTR [rsp+56] + mov rbp, 0 + adc rbp, 0 + shld rbp, r13, 1 + imul rbp, 19 + btr r13, 63 + ; Sub modulus (if overflow) + add rcx, rbp + adc r11, 0 + adc r12, 0 + adc r13, 0 + ; Sub + sub r14, QWORD PTR [rsp+32] + sbb r15, QWORD PTR [rsp+40] + sbb rdi, QWORD PTR [rsp+48] + sbb rsi, QWORD PTR [rsp+56] + sbb rbp, rbp + shld rbp, rsi, 1 + imul rbp, -19 + btr rsi, 63 + ; Add modulus (if underflow) + sub r14, rbp + sbb r15, 0 + sbb rdi, 0 + sbb rsi, 0 + mov QWORD PTR [rsp+32], rcx + mov QWORD PTR [rsp+40], r11 + mov QWORD PTR [rsp+48], r12 + mov QWORD PTR [rsp+56], r13 + mov QWORD PTR [rsp+96], r14 + mov QWORD PTR [rsp+104], r15 + mov QWORD PTR [rsp+112], rdi + mov QWORD PTR [rsp+120], rsi + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+32] + mov rcx, rax + mov r11, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+32] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+40] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rsp+144] + mul QWORD PTR [rsp+32] + add r12, rax + adc r13, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+40] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+48] + add r12, rax + adc 
r13, rdx + adc r14, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rsp+152] + mul QWORD PTR [rsp+32] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rsp+144] + mul QWORD PTR [rsp+40] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+48] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+56] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [rsp+152] + mul QWORD PTR [rsp+40] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rsp+144] + mul QWORD PTR [rsp+48] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+56] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rsp+152] + mul QWORD PTR [rsp+48] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rsp+144] + mul QWORD PTR [rsp+56] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rsp+152] + mul QWORD PTR [rsp+56] + add rdi, rax + adc rsi, rdx + mov rax, 38 + mul rsi + add r13, rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + adc r12, r15 + adc r13, rdi + ; Store + mov QWORD PTR [rsp+32], rcx + mov QWORD PTR [rsp+40], r11 + mov QWORD PTR [rsp+48], r12 + mov QWORD PTR [rsp+56], r13 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rsp+96] + mov rcx, rax + mov r11, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rsp+96] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[1] * B[0] + mov 
rax, QWORD PTR [r9] + mul QWORD PTR [rsp+104] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rsp+96] + add r12, rax + adc r13, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rsp+104] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rsp+112] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rsp+96] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rsp+104] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rsp+112] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rsp+120] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rsp+104] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rsp+112] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rsp+120] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rsp+112] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rsp+120] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rsp+120] + add rdi, rax + adc rsi, rdx + mov rax, 38 + mul rsi + add r13, rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + 
adc r12, r15 + adc r13, rdi + ; Store + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], r11 + mov QWORD PTR [rsp+16], r12 + mov QWORD PTR [rsp+24], r13 + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+136] + mov r11, rax + mov r12, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+144] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+152] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+144] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+152] + add r14, rax + adc r15, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [rsp+144] + mul QWORD PTR [rsp+152] + xor rdi, rdi + add r15, rax + adc rdi, rdx + ; Double + xor rsi, rsi + add r11, r11 + adc r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, rdi + adc rsi, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [rsp+128] + mul rax + mov rcx, rax + mov rbp, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [rsp+136] + mul rax + add r11, rbp + adc r12, rax + adc rdx, 0 + mov rbp, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [rsp+144] + mul rax + add r13, rbp + adc r14, rax + adc rdx, 0 + mov rbp, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [rsp+152] + mul rax + add rdi, rax + adc rsi, rdx + add r15, rbp + adc rdi, 0 + adc rsi, 0 + mov rax, 38 + mul rsi + add r13, rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + adc r12, r15 + adc r13, rdi + ; Store + mov QWORD PTR [rsp+96], rcx + mov QWORD PTR [rsp+104], r11 + mov QWORD PTR [rsp+112], r12 + mov QWORD PTR [rsp+120], r13 + ; Square + ; 
A[0] * A[1] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r9+8] + mov r11, rax + mov r12, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r9+16] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r9+24] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r9+16] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r9+24] + add r14, rax + adc r15, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r9+24] + xor rdi, rdi + add r15, rax + adc rdi, rdx + ; Double + xor rsi, rsi + add r11, r11 + adc r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, rdi + adc rsi, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [r9] + mul rax + mov rcx, rax + mov rbp, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [r9+8] + mul rax + add r11, rbp + adc r12, rax + adc rdx, 0 + mov rbp, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [r9+16] + mul rax + add r13, rbp + adc r14, rax + adc rdx, 0 + mov rbp, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [r9+24] + mul rax + add rdi, rax + adc rsi, rdx + add r15, rbp + adc rdi, 0 + adc rsi, 0 + mov rax, 38 + mul rsi + add r13, rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + adc r12, r15 + adc r13, rdi + ; Store + mov QWORD PTR [rsp+128], rcx + mov QWORD PTR [rsp+136], r11 + mov QWORD PTR [rsp+144], r12 + mov QWORD PTR [rsp+152], r13 + ; Add-Sub + ; Add + mov rcx, QWORD PTR [rsp] + mov r11, QWORD PTR [rsp+8] + mov r12, QWORD PTR [rsp+16] + mov r13, QWORD PTR [rsp+24] + mov r14, rcx + add rcx, QWORD PTR [rsp+32] + mov r15, r11 + adc r11, QWORD PTR [rsp+40] + mov 
rdi, r12 + adc r12, QWORD PTR [rsp+48] + mov rsi, r13 + adc r13, QWORD PTR [rsp+56] + mov rbp, 0 + adc rbp, 0 + shld rbp, r13, 1 + imul rbp, 19 + btr r13, 63 + ; Sub modulus (if overflow) + add rcx, rbp + adc r11, 0 + adc r12, 0 + adc r13, 0 + ; Sub + sub r14, QWORD PTR [rsp+32] + sbb r15, QWORD PTR [rsp+40] + sbb rdi, QWORD PTR [rsp+48] + sbb rsi, QWORD PTR [rsp+56] + sbb rbp, rbp + shld rbp, rsi, 1 + imul rbp, -19 + btr rsi, 63 + ; Add modulus (if underflow) + sub r14, rbp + sbb r15, 0 + sbb rdi, 0 + sbb rsi, 0 + mov QWORD PTR [rsp+64], rcx + mov QWORD PTR [rsp+72], r11 + mov QWORD PTR [rsp+80], r12 + mov QWORD PTR [rsp+88], r13 + mov QWORD PTR [rsp+32], r14 + mov QWORD PTR [rsp+40], r15 + mov QWORD PTR [rsp+48], rdi + mov QWORD PTR [rsp+56], rsi + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+128] + mov rcx, rax + mov r11, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+128] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+136] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+128] + add r12, rax + adc r13, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+136] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+144] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+128] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+136] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+144] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+152] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [rsp+120] + 
mul QWORD PTR [rsp+136] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+144] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+152] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+144] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+152] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+152] + add rdi, rax + adc rsi, rdx + mov rax, 38 + mul rsi + add r13, rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + adc r12, r15 + adc r13, rdi + ; Store + mov QWORD PTR [r9], rcx + mov QWORD PTR [r9+8], r11 + mov QWORD PTR [r9+16], r12 + mov QWORD PTR [r9+24], r13 + ; Sub + mov rcx, QWORD PTR [rsp+128] + mov r11, QWORD PTR [rsp+136] + mov r12, QWORD PTR [rsp+144] + mov r13, QWORD PTR [rsp+152] + sub rcx, QWORD PTR [rsp+96] + sbb r11, QWORD PTR [rsp+104] + sbb r12, QWORD PTR [rsp+112] + sbb r13, QWORD PTR [rsp+120] + sbb rbp, rbp + shld rbp, r13, 1 + imul rbp, -19 + btr r13, 63 + ; Add modulus (if underflow) + sub rcx, rbp + sbb r11, 0 + sbb r12, 0 + sbb r13, 0 + mov QWORD PTR [rsp+128], rcx + mov QWORD PTR [rsp+136], r11 + mov QWORD PTR [rsp+144], r12 + mov QWORD PTR [rsp+152], r13 + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [rsp+32] + mul QWORD PTR [rsp+40] + mov r11, rax + mov r12, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [rsp+32] + mul QWORD PTR [rsp+48] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[0] * A[3] + mov rax, QWORD PTR 
[rsp+32] + mul QWORD PTR [rsp+56] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [rsp+40] + mul QWORD PTR [rsp+48] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [rsp+40] + mul QWORD PTR [rsp+56] + add r14, rax + adc r15, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [rsp+48] + mul QWORD PTR [rsp+56] + xor rdi, rdi + add r15, rax + adc rdi, rdx + ; Double + xor rsi, rsi + add r11, r11 + adc r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, rdi + adc rsi, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [rsp+32] + mul rax + mov rcx, rax + mov rbp, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [rsp+40] + mul rax + add r11, rbp + adc r12, rax + adc rdx, 0 + mov rbp, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [rsp+48] + mul rax + add r13, rbp + adc r14, rax + adc rdx, 0 + mov rbp, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [rsp+56] + mul rax + add rdi, rax + adc rsi, rdx + add r15, rbp + adc rdi, 0 + adc rsi, 0 + mov rax, 38 + mul rsi + add r13, rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + adc r12, r15 + adc r13, rdi + ; Store + mov QWORD PTR [rsp+32], rcx + mov QWORD PTR [rsp+40], r11 + mov QWORD PTR [rsp+48], r12 + mov QWORD PTR [rsp+56], r13 + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [rsp+64] + mul QWORD PTR [rsp+72] + mov r11, rax + mov r12, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [rsp+64] + mul QWORD PTR [rsp+80] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [rsp+64] + mul QWORD PTR [rsp+88] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [rsp+72] + mul QWORD PTR [rsp+80] + xor r15, r15 + add r13, rax + adc r14, 
rdx + adc r15, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [rsp+72] + mul QWORD PTR [rsp+88] + add r14, rax + adc r15, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [rsp+80] + mul QWORD PTR [rsp+88] + xor rdi, rdi + add r15, rax + adc rdi, rdx + ; Double + xor rsi, rsi + add r11, r11 + adc r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, rdi + adc rsi, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [rsp+64] + mul rax + mov rcx, rax + mov rbp, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [rsp+72] + mul rax + add r11, rbp + adc r12, rax + adc rdx, 0 + mov rbp, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [rsp+80] + mul rax + add r13, rbp + adc r14, rax + adc rdx, 0 + mov rbp, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [rsp+88] + mul rax + add rdi, rax + adc rsi, rdx + add r15, rbp + adc rdi, 0 + adc rsi, 0 + mov rax, 38 + mul rsi + add r13, rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + adc r12, r15 + adc r13, rdi + ; Store + mov QWORD PTR [rsp+64], rcx + mov QWORD PTR [rsp+72], r11 + mov QWORD PTR [rsp+80], r12 + mov QWORD PTR [rsp+88], r13 + ; Multiply by 121666 + mov rax, 121666 + mul QWORD PTR [rsp+128] + xor r12, r12 + mov rcx, rax + mov r11, rdx + mov rax, 121666 + mul QWORD PTR [rsp+136] + xor r13, r13 + add r11, rax + adc r12, rdx + mov rax, 121666 + mul QWORD PTR [rsp+144] + xor r15, r15 + add r12, rax + adc r13, rdx + mov rax, 121666 + mul QWORD PTR [rsp+152] + mov r14, 9223372036854775807 + add r13, rax + adc r15, rdx + add rcx, QWORD PTR [rsp+96] + adc r11, QWORD PTR [rsp+104] + adc r12, QWORD PTR [rsp+112] + adc r13, QWORD PTR [rsp+120] + adc r15, 0 + shld r15, r13, 1 + and r13, r14 + mov rax, 19 + mul r15 + add rcx, rax + adc r11, 0 + adc r12, 0 + adc r13, 0 + 
mov QWORD PTR [rsp+96], rcx + mov QWORD PTR [rsp+104], r11 + mov QWORD PTR [rsp+112], r12 + mov QWORD PTR [rsp+120], r13 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rsp+32] + mul QWORD PTR [r8] + mov rcx, rax + mov r11, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [rsp+40] + mul QWORD PTR [r8] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rsp+32] + mul QWORD PTR [r8+8] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rsp+48] + mul QWORD PTR [r8] + add r12, rax + adc r13, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rsp+40] + mul QWORD PTR [r8+8] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rsp+32] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rsp+56] + mul QWORD PTR [r8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rsp+48] + mul QWORD PTR [r8+8] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [rsp+40] + mul QWORD PTR [r8+16] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [rsp+32] + mul QWORD PTR [r8+24] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [rsp+56] + mul QWORD PTR [r8+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rsp+48] + mul QWORD PTR [r8+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rsp+40] + mul QWORD PTR [r8+24] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rsp+56] + mul QWORD PTR [r8+16] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rsp+48] + mul QWORD PTR [r8+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rsp+56] + mul QWORD PTR [r8+24] + add rdi, rax + adc rsi, rdx + mov rax, 38 + mul rsi + add r13, 
rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + adc r12, r15 + adc r13, rdi + ; Store + mov QWORD PTR [rsp+32], rcx + mov QWORD PTR [rsp+40], r11 + mov QWORD PTR [rsp+48], r12 + mov QWORD PTR [rsp+56], r13 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+128] + mov rcx, rax + mov r11, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+128] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+136] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+128] + add r12, rax + adc r13, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+136] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+144] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+128] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+136] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+144] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+152] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+136] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+144] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rsp+104] + 
mul QWORD PTR [rsp+152] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+144] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+152] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+152] + add rdi, rax + adc rsi, rdx + mov rax, 38 + mul rsi + add r13, rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + adc r12, r15 + adc r13, rdi + ; Store + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], r11 + mov QWORD PTR [rsp+16], r12 + mov QWORD PTR [rsp+24], r13 + mov r11, QWORD PTR [rsp+160] + dec r11 + cmp r11, 3 + jge L_curve25519_x64_bits + mov QWORD PTR [rsp+160], 2 + neg rbx + ; Conditional Swap + mov rcx, QWORD PTR [r9] + mov r11, QWORD PTR [r9+8] + mov r12, QWORD PTR [r9+16] + mov r13, QWORD PTR [r9+24] + mov r14, QWORD PTR [rsp] + mov r15, QWORD PTR [rsp+8] + mov rdi, QWORD PTR [rsp+16] + mov rsi, QWORD PTR [rsp+24] + xor rcx, QWORD PTR [rsp+64] + xor r11, QWORD PTR [rsp+72] + xor r12, QWORD PTR [rsp+80] + xor r13, QWORD PTR [rsp+88] + xor r14, QWORD PTR [rsp+32] + xor r15, QWORD PTR [rsp+40] + xor rdi, QWORD PTR [rsp+48] + xor rsi, QWORD PTR [rsp+56] + and rcx, rbx + and r11, rbx + and r12, rbx + and r13, rbx + and r14, rbx + and r15, rbx + and rdi, rbx + and rsi, rbx + xor QWORD PTR [r9], rcx + xor QWORD PTR [r9+8], r11 + xor QWORD PTR [r9+16], r12 + xor QWORD PTR [r9+24], r13 + xor QWORD PTR [rsp], r14 + xor QWORD PTR [rsp+8], r15 + xor QWORD PTR [rsp+16], rdi + xor QWORD PTR [rsp+24], rsi + xor QWORD PTR [rsp+64], rcx + xor QWORD PTR [rsp+72], r11 + xor QWORD PTR 
[rsp+80], r12 + xor QWORD PTR [rsp+88], r13 + xor QWORD PTR [rsp+32], r14 + xor QWORD PTR [rsp+40], r15 + xor QWORD PTR [rsp+48], rdi + xor QWORD PTR [rsp+56], rsi +L_curve25519_x64_3: + ; Add-Sub + ; Add + mov rcx, QWORD PTR [r9] + mov r11, QWORD PTR [r9+8] + mov r12, QWORD PTR [r9+16] + mov r13, QWORD PTR [r9+24] + mov r14, rcx + add rcx, QWORD PTR [rsp] + mov r15, r11 + adc r11, QWORD PTR [rsp+8] + mov rdi, r12 + adc r12, QWORD PTR [rsp+16] + mov rsi, r13 + adc r13, QWORD PTR [rsp+24] + mov rbp, 0 + adc rbp, 0 + shld rbp, r13, 1 + imul rbp, 19 + btr r13, 63 + ; Sub modulus (if overflow) + add rcx, rbp + adc r11, 0 + adc r12, 0 + adc r13, 0 + ; Sub + sub r14, QWORD PTR [rsp] + sbb r15, QWORD PTR [rsp+8] + sbb rdi, QWORD PTR [rsp+16] + sbb rsi, QWORD PTR [rsp+24] + sbb rbp, rbp + shld rbp, rsi, 1 + imul rbp, -19 + btr rsi, 63 + ; Add modulus (if underflow) + sub r14, rbp + sbb r15, 0 + sbb rdi, 0 + sbb rsi, 0 + mov QWORD PTR [r9], rcx + mov QWORD PTR [r9+8], r11 + mov QWORD PTR [r9+16], r12 + mov QWORD PTR [r9+24], r13 + mov QWORD PTR [rsp+128], r14 + mov QWORD PTR [rsp+136], r15 + mov QWORD PTR [rsp+144], rdi + mov QWORD PTR [rsp+152], rsi + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+136] + mov r11, rax + mov r12, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+144] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [rsp+128] + mul QWORD PTR [rsp+152] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+144] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [rsp+136] + mul QWORD PTR [rsp+152] + add r14, rax + adc r15, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [rsp+144] + mul QWORD PTR [rsp+152] + xor rdi, rdi + add r15, rax + adc rdi, rdx + ; Double + xor rsi, rsi + add r11, r11 + adc r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, rdi + adc rsi, 0 + ; A[0] 
* A[0] + mov rax, QWORD PTR [rsp+128] + mul rax + mov rcx, rax + mov rbp, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [rsp+136] + mul rax + add r11, rbp + adc r12, rax + adc rdx, 0 + mov rbp, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [rsp+144] + mul rax + add r13, rbp + adc r14, rax + adc rdx, 0 + mov rbp, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [rsp+152] + mul rax + add rdi, rax + adc rsi, rdx + add r15, rbp + adc rdi, 0 + adc rsi, 0 + mov rax, 38 + mul rsi + add r13, rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + adc r12, r15 + adc r13, rdi + ; Store + mov QWORD PTR [rsp+96], rcx + mov QWORD PTR [rsp+104], r11 + mov QWORD PTR [rsp+112], r12 + mov QWORD PTR [rsp+120], r13 + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r9+8] + mov r11, rax + mov r12, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r9+16] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r9+24] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r9+16] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r9+24] + add r14, rax + adc r15, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r9+24] + xor rdi, rdi + add r15, rax + adc rdi, rdx + ; Double + xor rsi, rsi + add r11, r11 + adc r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, rdi + adc rsi, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [r9] + mul rax + mov rcx, rax + mov rbp, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [r9+8] + mul rax + add r11, rbp + adc r12, rax + adc rdx, 0 + mov rbp, rdx + ; A[2] * A[2] + mov 
rax, QWORD PTR [r9+16] + mul rax + add r13, rbp + adc r14, rax + adc rdx, 0 + mov rbp, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [r9+24] + mul rax + add rdi, rax + adc rsi, rdx + add r15, rbp + adc rdi, 0 + adc rsi, 0 + mov rax, 38 + mul rsi + add r13, rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + adc r12, r15 + adc r13, rdi + ; Store + mov QWORD PTR [rsp+128], rcx + mov QWORD PTR [rsp+136], r11 + mov QWORD PTR [rsp+144], r12 + mov QWORD PTR [rsp+152], r13 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+128] + mov rcx, rax + mov r11, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+128] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+136] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+128] + add r12, rax + adc r13, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+136] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+144] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+128] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+136] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+144] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+152] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[3] + mov rax, QWORD PTR 
[rsp+120] + mul QWORD PTR [rsp+136] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+144] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+152] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+144] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+152] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+152] + add rdi, rax + adc rsi, rdx + mov rax, 38 + mul rsi + add r13, rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + adc r12, r15 + adc r13, rdi + ; Store + mov QWORD PTR [r9], rcx + mov QWORD PTR [r9+8], r11 + mov QWORD PTR [r9+16], r12 + mov QWORD PTR [r9+24], r13 + ; Sub + mov rcx, QWORD PTR [rsp+128] + mov r11, QWORD PTR [rsp+136] + mov r12, QWORD PTR [rsp+144] + mov r13, QWORD PTR [rsp+152] + sub rcx, QWORD PTR [rsp+96] + sbb r11, QWORD PTR [rsp+104] + sbb r12, QWORD PTR [rsp+112] + sbb r13, QWORD PTR [rsp+120] + sbb rbp, rbp + shld rbp, r13, 1 + imul rbp, -19 + btr r13, 63 + ; Add modulus (if underflow) + sub rcx, rbp + sbb r11, 0 + sbb r12, 0 + sbb r13, 0 + mov QWORD PTR [rsp+128], rcx + mov QWORD PTR [rsp+136], r11 + mov QWORD PTR [rsp+144], r12 + mov QWORD PTR [rsp+152], r13 + ; Multiply by 121666 + mov rax, 121666 + mul QWORD PTR [rsp+128] + xor r12, r12 + mov rcx, rax + mov r11, rdx + mov rax, 121666 + mul QWORD PTR [rsp+136] + xor r13, r13 + add r11, rax + adc r12, rdx + mov rax, 121666 + mul QWORD PTR [rsp+144] + xor 
r15, r15 + add r12, rax + adc r13, rdx + mov rax, 121666 + mul QWORD PTR [rsp+152] + mov r14, 9223372036854775807 + add r13, rax + adc r15, rdx + add rcx, QWORD PTR [rsp+96] + adc r11, QWORD PTR [rsp+104] + adc r12, QWORD PTR [rsp+112] + adc r13, QWORD PTR [rsp+120] + adc r15, 0 + shld r15, r13, 1 + and r13, r14 + mov rax, 19 + mul r15 + add rcx, rax + adc r11, 0 + adc r12, 0 + adc r13, 0 + mov QWORD PTR [rsp+96], rcx + mov QWORD PTR [rsp+104], r11 + mov QWORD PTR [rsp+112], r12 + mov QWORD PTR [rsp+120], r13 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+128] + mov rcx, rax + mov r11, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+128] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+136] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+128] + add r12, rax + adc r13, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+136] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+144] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+128] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+136] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [rsp+104] + mul QWORD PTR [rsp+144] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [rsp+96] + mul QWORD PTR [rsp+152] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+136] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+144] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rsp+104] 
+ mul QWORD PTR [rsp+152] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+144] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rsp+112] + mul QWORD PTR [rsp+152] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rsp+120] + mul QWORD PTR [rsp+152] + add rdi, rax + adc rsi, rdx + mov rax, 38 + mul rsi + add r13, rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + adc r12, r15 + adc r13, rdi + ; Store + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], r11 + mov QWORD PTR [rsp+16], r12 + mov QWORD PTR [rsp+24], r13 + dec QWORD PTR [rsp+160] + jge L_curve25519_x64_3 + ; Invert + lea rcx, QWORD PTR [rsp+32] + mov rdx, rsp + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+64] + mov rdx, rsp + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+96] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 4 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_x64 + lea rcx, QWORD PTR 
[rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 9 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+128] + mov r8, 19 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+128] + lea r8, QWORD PTR [rsp+96] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 9 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 49 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+128] + mov r8, 99 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+128] + lea r8, QWORD PTR [rsp+96] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 49 + call fe_sq_n_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + mov r8, 4 + call fe_sq_n_x64 + mov rcx, rsp + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+32] + call fe_mul_x64 + mov r9, QWORD PTR [rsp+168] + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rsp] + mul QWORD PTR [r9] + mov rcx, rax + mov r11, rdx + ; A[0] * B[1] + mov rax, QWORD PTR 
[rsp+8] + mul QWORD PTR [r9] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rsp] + mul QWORD PTR [r9+8] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rsp+16] + mul QWORD PTR [r9] + add r12, rax + adc r13, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rsp+8] + mul QWORD PTR [r9+8] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rsp] + mul QWORD PTR [r9+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rsp+24] + mul QWORD PTR [r9] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rsp+16] + mul QWORD PTR [r9+8] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [rsp+8] + mul QWORD PTR [r9+16] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [rsp] + mul QWORD PTR [r9+24] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [rsp+24] + mul QWORD PTR [r9+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rsp+16] + mul QWORD PTR [r9+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rsp+8] + mul QWORD PTR [r9+24] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rsp+24] + mul QWORD PTR [r9+16] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rsp+16] + mul QWORD PTR [r9+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rsp+24] + mul QWORD PTR [r9+24] + add rdi, rax + adc rsi, rdx + mov rax, 38 + mul rsi + add r13, rax + adc rdx, 0 + mov rbp, 9223372036854775807 + shld rdx, r13, 1 + imul rdx, rdx, 19 + and r13, rbp + mov rbp, rdx + mov rax, 38 + mul r14 + xor r14, r14 + add rcx, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + 
xor rdi, rdi + add r12, rax + adc rdi, rdx + add rcx, rbp + adc r11, r14 + adc r12, r15 + adc r13, rdi + mov rbp, 9223372036854775807 + mov rax, r13 + sar rax, 63 + and rax, 19 + and r13, rbp + add rcx, rax + adc r11, 0 + adc r12, 0 + adc r13, 0 + mov rax, 9223372036854775807 + mov rdx, rcx + add rdx, 19 + mov rdx, r11 + adc rdx, 0 + mov rdx, r12 + adc rdx, 0 + mov rdx, r13 + adc rdx, 0 + sar rdx, 63 + and rdx, 19 + and r13, rax + add rcx, rdx + adc r11, 0 + adc r12, 0 + adc r13, 0 + and r13, rax + ; Store + mov QWORD PTR [r9], rcx + mov QWORD PTR [r9+8], r11 + mov QWORD PTR [r9+16], r12 + mov QWORD PTR [r9+24], r13 + xor rax, rax + add rsp, 176 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +curve25519_x64 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +; fe_pow22523_x64: runs the fixed "pow22523" square-and-multiply chain on a field element, i.e. raises the input z to the power 2^252 - 3. +; In: rcx = first argument, used as the first argument of the final fe_mul_x64 call (apparent result pointer); rdx = pointer to the input z. +; Stack frame (112 bytes): [rsp] = t0, [rsp+32] = t1, [rsp+64] = t2 (temporaries), [rsp+96] = saved rcx, [rsp+104] = saved rdx. +; NOTE(review): the exponents in the comments below assume fe_sq_x64 squares once, fe_sq_n_x64 squares r8 times and fe_mul_x64 multiplies, each taking rcx = destination and rdx (and r8) = sources; those routines are defined outside this view - confirm. +fe_pow22523_x64 PROC + sub rsp, 112 + ; pow22523 + ; Save both argument registers; rcx and rdx are reused for call arguments below + mov QWORD PTR [rsp+96], rcx + mov QWORD PTR [rsp+104], rdx + ; t0 = z^2 + mov rcx, rsp + mov rdx, QWORD PTR [rsp+104] + call fe_sq_x64 + ; t1 = t0^2 = z^4 + lea rcx, QWORD PTR [rsp+32] + mov rdx, rsp + call fe_sq_x64 + ; t1 = t1^2 = z^8 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_x64 + ; t1 = z * t1 = z^9 + lea rcx, QWORD PTR [rsp+32] + mov rdx, QWORD PTR [rsp+104] + lea r8, QWORD PTR [rsp+32] + call fe_mul_x64 + ; t0 = t0 * t1 = z^11 + mov rcx, rsp + mov rdx, rsp + lea r8, QWORD PTR [rsp+32] + call fe_mul_x64 + ; t0 = t0^2 = z^22 + mov rcx, rsp + mov rdx, rsp + call fe_sq_x64 + ; t0 = t1 * t0 = z^31 = z^(2^5 - 1) + mov rcx, rsp + lea rdx, QWORD PTR [rsp+32] + mov r8, rsp + call fe_mul_x64 + ; t1 = t0 squared 5 times (1, then 4) = z^(2^10 - 2^5) + lea rcx, QWORD PTR [rsp+32] + mov rdx, rsp + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, 4 + call fe_sq_n_x64 + ; t0 = t1 * t0 = z^(2^10 - 1) + mov rcx, rsp + lea rdx, QWORD PTR [rsp+32] + mov r8, rsp + call fe_mul_x64 + ; t1 = t0 squared 10 times (1, then 9) = z^(2^20 - 2^10) + lea rcx, QWORD PTR [rsp+32] + mov rdx, rsp + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, 9 + call fe_sq_n_x64 + ; t1 = t1 * t0 = z^(2^20 - 1) + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, rsp + call fe_mul_x64 + ; t2 = t1 squared 20 times (1, then 19) = z^(2^40 - 2^20) + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_x64 + lea rcx, QWORD PTR
[rsp+64] + lea rdx, QWORD PTR [rsp+64] + mov r8, 19 + call fe_sq_n_x64 + ; t1 = t2 * t1 = z^(2^40 - 1) + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+32] + call fe_mul_x64 + ; t1 = t1 squared 10 times (1, then 9) = z^(2^50 - 2^10) + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, 9 + call fe_sq_n_x64 + ; t0 = t1 * t0 = z^(2^50 - 1) + mov rcx, rsp + lea rdx, QWORD PTR [rsp+32] + mov r8, rsp + call fe_mul_x64 + ; t1 = t0 squared 50 times (1, then 49) = z^(2^100 - 2^50) + lea rcx, QWORD PTR [rsp+32] + mov rdx, rsp + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, 49 + call fe_sq_n_x64 + ; t1 = t1 * t0 = z^(2^100 - 1) + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, rsp + call fe_mul_x64 + ; t2 = t1 squared 100 times (1, then 99) = z^(2^200 - 2^100) + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + mov r8, 99 + call fe_sq_n_x64 + ; t1 = t2 * t1 = z^(2^200 - 1) + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+32] + call fe_mul_x64 + ; t1 = t1 squared 50 times (1, then 49) = z^(2^250 - 2^50) + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_x64 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, 49 + call fe_sq_n_x64 + ; t0 = t1 * t0 = z^(2^250 - 1) + mov rcx, rsp + lea rdx, QWORD PTR [rsp+32] + mov r8, rsp + call fe_mul_x64 + ; t0 = t0 squared twice = z^(2^252 - 4) + mov rcx, rsp + mov rdx, rsp + call fe_sq_x64 + mov rcx, rsp + mov rdx, rsp + call fe_sq_x64 + ; result = t0 * z = z^(2^252 - 3), written through the saved rcx + mov rcx, QWORD PTR [rsp+96] + mov rdx, rsp + mov r8, QWORD PTR [rsp+104] + call fe_mul_x64 + ; Reload the entry values of rdx and rcx, then release the 112-byte frame + mov rdx, QWORD PTR [rsp+104] + mov rcx, QWORD PTR [rsp+96] + add rsp, 112 + ret +fe_pow22523_x64 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_p1p1_to_p2_x64 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + mov r8, rdx + sub rsp, 16 + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], r8 + mov r9, r8 + add r9, 96 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8] + mov r11, rax + mov r12, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul
QWORD PTR [r8+8] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8] + add r13, rax + adc r14, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+16] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+8] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+24] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+8] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+16] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+24] + add rsi, rax + adc rbx, rdx + mov rax, 38 + mul rbx + add r14, rax + adc rdx, 0 + mov r10, 9223372036854775807 + shld rdx, r14, 1 + imul rdx, rdx, 19 + and r14, r10 + mov r10, rdx + mov rax, 38 + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + adc rsi, rdx + add r11, r10 + adc r12, r15 + adc r13, rdi + adc r14, rsi + ; Store + mov QWORD PTR [rcx], 
r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 + mov QWORD PTR [rcx+24], r14 + add r8, 64 + add rcx, 64 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8] + mov r11, rax + mov r12, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+8] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8] + add r13, rax + adc r14, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+16] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+8] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+24] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+8] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+16] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+24] + add rsi, rax + adc rbx, rdx + mov rax, 38 + mul rbx + add r14, rax + adc rdx, 0 + mov r10, 
9223372036854775807 + shld rdx, r14, 1 + imul rdx, rdx, 19 + and r14, r10 + mov r10, rdx + mov rax, 38 + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + adc rsi, rdx + add r11, r10 + adc r12, r15 + adc r13, rdi + adc r14, rsi + ; Store + mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 + mov QWORD PTR [rcx+24], r14 + mov r9, r8 + sub r9, 32 + sub rcx, 32 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8] + mov r11, rax + mov r12, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+8] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8] + add r13, rax + adc r14, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+16] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+8] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+24] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+8] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; 
A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+16] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+24] + add rsi, rax + adc rbx, rdx + mov rax, 38 + mul rbx + add r14, rax + adc rdx, 0 + mov r10, 9223372036854775807 + shld rdx, r14, 1 + imul rdx, rdx, 19 + and r14, r10 + mov r10, rdx + mov rax, 38 + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + adc rsi, rdx + add r11, r10 + adc r12, r15 + adc r13, rdi + adc r14, rsi + ; Store + mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 + mov QWORD PTR [rcx+24], r14 + add rsp, 16 + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +ge_p1p1_to_p2_x64 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_p1p1_to_p3_x64 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + mov r8, rdx + sub rsp, 16 + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], r8 + mov r9, r8 + add r9, 96 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8] + mov r11, rax + mov r12, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+8] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8] + add r13, rax + adc r14, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+16] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8] + xor rdi, rdi + add 
r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+8] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+24] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+8] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+16] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+24] + add rsi, rax + adc rbx, rdx + mov rax, 38 + mul rbx + add r14, rax + adc rdx, 0 + mov r10, 9223372036854775807 + shld rdx, r14, 1 + imul rdx, rdx, 19 + and r14, r10 + mov r10, rdx + mov rax, 38 + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + adc rsi, rdx + add r11, r10 + adc r12, r15 + adc r13, rdi + adc r14, rsi + ; Store + mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 + mov QWORD PTR [rcx+24], r14 + mov r9, r8 + add r9, 32 + add rcx, 96 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8] + mov r11, rax + mov r12, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+8] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[2] + 
mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8] + add r13, rax + adc r14, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+16] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+8] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+24] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+8] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+16] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+24] + add rsi, rax + adc rbx, rdx + mov rax, 38 + mul rbx + add r14, rax + adc rdx, 0 + mov r10, 9223372036854775807 + shld rdx, r14, 1 + imul rdx, rdx, 19 + and r14, r10 + mov r10, rdx + mov rax, 38 + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + adc rsi, rdx + add r11, r10 + adc r12, r15 + adc r13, rdi + adc r14, rsi + ; Store + mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 + mov QWORD PTR [rcx+24], r14 
+ add r8, 64 + sub rcx, 64 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8] + mov r11, rax + mov r12, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+8] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8] + add r13, rax + adc r14, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+16] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+8] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+24] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+8] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+16] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+24] + add rsi, rax + adc rbx, rdx + mov rax, 38 + mul rbx + add r14, rax + adc rdx, 0 + mov r10, 9223372036854775807 + shld rdx, r14, 1 + imul rdx, rdx, 19 + and r14, r10 + mov r10, rdx + mov 
rax, 38 + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + adc rsi, rdx + add r11, r10 + adc r12, r15 + adc r13, rdi + adc r14, rsi + ; Store + mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 + mov QWORD PTR [rcx+24], r14 + mov r9, r8 + add r9, 32 + add rcx, 32 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8] + mov r11, rax + mov r12, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+8] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8] + add r13, rax + adc r14, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+16] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+8] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+24] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+8] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+16] + xor rbx, rbx + add rdi, rax 
+ adc rsi, rdx + adc rbx, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+24] + add rsi, rax + adc rbx, rdx + mov rax, 38 + mul rbx + add r14, rax + adc rdx, 0 + mov r10, 9223372036854775807 + shld rdx, r14, 1 + imul rdx, rdx, 19 + and r14, r10 + mov r10, rdx + mov rax, 38 + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + adc rsi, rdx + add r11, r10 + adc r12, r15 + adc r13, rdi + adc r14, rsi + ; Store + mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 + mov QWORD PTR [rcx+24], r14 + add rsp, 16 + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +ge_p1p1_to_p3_x64 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_p2_dbl_x64 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + mov r8, rdx + sub rsp, 16 + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], r8 + add rcx, 64 + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+8] + mov r12, rax + mov r13, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+16] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+24] + xor r15, r15 + add r14, rax + adc r15, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [r8+16] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [r8+24] + add r15, rax + adc rdi, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [r8+16] + mul QWORD PTR [r8+24] + xor rsi, rsi + add rdi, rax + adc rsi, rdx + ; Double + xor rbx, rbx + add r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, rdi + adc rsi, rsi + adc rbx, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [r8] + mul rax + mov r11, rax + 
mov r10, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [r8+8] + mul rax + add r12, r10 + adc r13, rax + adc rdx, 0 + mov r10, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [r8+16] + mul rax + add r14, r10 + adc r15, rax + adc rdx, 0 + mov r10, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [r8+24] + mul rax + add rsi, rax + adc rbx, rdx + add rdi, r10 + adc rsi, 0 + adc rbx, 0 + mov rax, 38 + mul rbx + add r14, rax + adc rdx, 0 + mov r10, 9223372036854775807 + shld rdx, r14, 1 + imul rdx, rdx, 19 + and r14, r10 + mov r10, rdx + mov rax, 38 + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + adc rsi, rdx + add r11, r10 + adc r12, r15 + adc r13, rdi + adc r14, rsi + ; Store + mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 + mov QWORD PTR [rcx+24], r14 + add r8, 32 + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+8] + mov r12, rax + mov r13, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+16] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+24] + xor r15, r15 + add r14, rax + adc r15, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [r8+16] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [r8+24] + add r15, rax + adc rdi, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [r8+16] + mul QWORD PTR [r8+24] + xor rsi, rsi + add rdi, rax + adc rsi, rdx + ; Double + xor rbx, rbx + add r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, rdi + adc rsi, rsi + adc rbx, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [r8] + mul rax + mov r11, rax + mov r10, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [r8+8] + mul rax + add r12, r10 + adc r13, rax + adc rdx, 0 + mov r10, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [r8+16] + mul rax + add r14, r10 + adc r15, rax + 
adc rdx, 0 + mov r10, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [r8+24] + mul rax + add rsi, rax + adc rbx, rdx + add rdi, r10 + adc rsi, 0 + adc rbx, 0 + mov rax, 38 + mul rbx + add r14, rax + adc rdx, 0 + mov r10, 9223372036854775807 + shld rdx, r14, 1 + imul rdx, rdx, 19 + and r14, r10 + mov r10, rdx + mov rax, 38 + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + adc rsi, rdx + add r11, r10 + adc r12, r15 + adc r13, rdi + adc r14, rsi + ; Store + mov r8, rcx + sub rcx, 32 + ; Add-Sub + ; Add + mov r15, r11 + add r11, QWORD PTR [r8] + mov rdi, r12 + adc r12, QWORD PTR [r8+8] + mov rsi, r13 + adc r13, QWORD PTR [r8+16] + mov rbx, r14 + adc r14, QWORD PTR [r8+24] + mov r10, 0 + adc r10, 0 + shld r10, r14, 1 + imul r10, 19 + btr r14, 63 + ; Sub modulus (if overflow) + add r11, r10 + adc r12, 0 + adc r13, 0 + adc r14, 0 + ; Sub + sub r15, QWORD PTR [r8] + sbb rdi, QWORD PTR [r8+8] + sbb rsi, QWORD PTR [r8+16] + sbb rbx, QWORD PTR [r8+24] + sbb r10, r10 + shld r10, rbx, 1 + imul r10, -19 + btr rbx, 63 + ; Add modulus (if underflow) + sub r15, r10 + sbb rdi, 0 + sbb rsi, 0 + sbb rbx, 0 + mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 + mov QWORD PTR [rcx+24], r14 + mov QWORD PTR [r8], r15 + mov QWORD PTR [r8+8], rdi + mov QWORD PTR [r8+16], rsi + mov QWORD PTR [r8+24], rbx + mov r9, QWORD PTR [rsp+8] + mov r8, r9 + add r8, 32 + sub rcx, 32 + ; Add + mov r11, QWORD PTR [r8] + mov r12, QWORD PTR [r8+8] + add r11, QWORD PTR [r9] + mov r13, QWORD PTR [r8+16] + adc r12, QWORD PTR [r9+8] + mov r14, QWORD PTR [r8+24] + adc r13, QWORD PTR [r9+16] + adc r14, QWORD PTR [r9+24] + mov r10, 0 + adc r10, 0 + shld r10, r14, 1 + imul r10, 19 + btr r14, 63 + ; Sub modulus (if overflow) + add r11, r10 + adc r12, 0 + adc r13, 0 + adc r14, 0 + mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 
+ mov QWORD PTR [rcx+24], r14 + ; Square + ; A[0] * A[1] + mov rax, QWORD PTR [rcx] + mul QWORD PTR [rcx+8] + mov r12, rax + mov r13, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [rcx] + mul QWORD PTR [rcx+16] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [rcx] + mul QWORD PTR [rcx+24] + xor r15, r15 + add r14, rax + adc r15, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [rcx+8] + mul QWORD PTR [rcx+16] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [rcx+8] + mul QWORD PTR [rcx+24] + add r15, rax + adc rdi, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [rcx+16] + mul QWORD PTR [rcx+24] + xor rsi, rsi + add rdi, rax + adc rsi, rdx + ; Double + xor rbx, rbx + add r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, rdi + adc rsi, rsi + adc rbx, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [rcx] + mul rax + mov r11, rax + mov r10, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [rcx+8] + mul rax + add r12, r10 + adc r13, rax + adc rdx, 0 + mov r10, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [rcx+16] + mul rax + add r14, r10 + adc r15, rax + adc rdx, 0 + mov r10, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [rcx+24] + mul rax + add rsi, rax + adc rbx, rdx + add rdi, r10 + adc rsi, 0 + adc rbx, 0 + mov rax, 38 + mul rbx + add r14, rax + adc rdx, 0 + mov r10, 9223372036854775807 + shld rdx, r14, 1 + imul rdx, rdx, 19 + and r14, r10 + mov r10, rdx + mov rax, 38 + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + adc rsi, rdx + add r11, r10 + adc r12, r15 + adc r13, rdi + adc r14, rsi + ; Store + mov r8, rcx + add r8, 32 + ; Sub + sub r11, QWORD PTR [r8] + sbb r12, QWORD PTR [r8+8] + sbb r13, QWORD PTR [r8+16] + sbb r14, QWORD PTR [r8+24] + sbb r10, r10 + shld r10, r14, 1 + imul r10, -19 + btr r14, 63 + ; Add modulus (if underflow) + sub r11, r10 + sbb r12, 0 + sbb r13, 0 + sbb r14, 0 
+ mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 + mov QWORD PTR [rcx+24], r14 + add r9, 64 + ; Square * 2 + ; A[0] * A[1] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r9+8] + mov r12, rax + mov r13, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r9+16] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r9+24] + xor r15, r15 + add r14, rax + adc r15, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r9+16] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r9+24] + add r15, rax + adc rdi, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r9+24] + xor rsi, rsi + add rdi, rax + adc rsi, rdx + ; Double + xor rbx, rbx + add r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, rdi + adc rsi, rsi + adc rbx, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [r9] + mul rax + mov r11, rax + mov r10, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [r9+8] + mul rax + add r12, r10 + adc r13, rax + adc rdx, 0 + mov r10, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [r9+16] + mul rax + add r14, r10 + adc r15, rax + adc rdx, 0 + mov r10, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [r9+24] + mul rax + add rsi, rax + adc rbx, rdx + add rdi, r10 + adc rsi, 0 + adc rbx, 0 + mov rax, 38 + mul rbx + add r14, rax + adc rdx, 0 + mov r10, 9223372036854775807 + shld rdx, r14, 1 + imul rdx, rdx, 19 + and r14, r10 + mov r10, rdx + mov rax, 38 + mul r15 + xor r15, r15 + add r11, rax + mov rax, 38 + adc r15, rdx + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + adc rsi, rdx + add r11, r10 + adc r12, r15 + adc r13, rdi + adc r14, rsi + mov rax, r14 + shld r14, r13, 1 + shld r13, r12, 1 + shld r12, r11, 1 + shl r11, 1 + mov r10, 9223372036854775807 + shr rax, 62 + and r14, r10 + imul rax, rax, 19 + add r11, rax + adc r12, 0 + adc r13, 0 + adc r14, 0 
+ ; Store + mov r8, rcx + add r8, 64 + add rcx, 96 + ; Sub + sub r11, QWORD PTR [r8] + sbb r12, QWORD PTR [r8+8] + sbb r13, QWORD PTR [r8+16] + sbb r14, QWORD PTR [r8+24] + sbb r10, r10 + shld r10, r14, 1 + imul r10, -19 + btr r14, 63 + ; Add modulus (if underflow) + sub r11, r10 + sbb r12, 0 + sbb r13, 0 + sbb r14, 0 + mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 + mov QWORD PTR [rcx+24], r14 + add rsp, 16 + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +ge_p2_dbl_x64 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_madd_x64 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov r9, r8 + mov r8, rdx + sub rsp, 24 + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], r8 + mov QWORD PTR [rsp+16], r9 + mov r10, r8 + mov r9, r8 + add r9, 32 + mov r8, rcx + add r8, 32 + ; Add-Sub + ; Add + mov r12, QWORD PTR [r9] + mov r13, QWORD PTR [r9+8] + mov r14, QWORD PTR [r9+16] + mov r15, QWORD PTR [r9+24] + mov rdi, r12 + add r12, QWORD PTR [r10] + mov rsi, r13 + adc r13, QWORD PTR [r10+8] + mov rbx, r14 + adc r14, QWORD PTR [r10+16] + mov rbp, r15 + adc r15, QWORD PTR [r10+24] + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [r10] + sbb rsi, QWORD PTR [r10+8] + sbb rbx, QWORD PTR [r10+16] + sbb rbp, QWORD PTR [r10+24] + sbb r11, r11 + shld r11, rbp, 1 + imul r11, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, r11 + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [r8], rdi + mov QWORD PTR [r8+8], rsi + mov QWORD PTR [r8+16], rbx + mov QWORD PTR [r8+24], rbp + mov r9, QWORD PTR [rsp+16] + add r9, 32 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8] + mov r12, rax + mov r13, rdx 
+ ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8] + add r14, rax + adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+8] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r8+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+8] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r8+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r8+24] + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r8+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + mov rax, 38 + adc 
rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + mov QWORD PTR [r8], r12 + mov QWORD PTR [r8+8], r13 + mov QWORD PTR [r8+16], r14 + mov QWORD PTR [r8+24], r15 + add r10, 96 + add r9, 32 + add rcx, 96 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10] + mov r12, rax + mov r13, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10] + add r14, rax + adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+8] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+8] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+24] + add rsi, rax + adc rbx, 
rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + mov rax, 38 + adc rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + sub r9, 64 + sub rcx, 96 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx] + mov r12, rax + mov r13, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx] + add r14, rax + adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+8] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+8] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + 
mul QWORD PTR [rcx+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+24] + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + mov rax, 38 + adc rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + ; Add-Sub + ; Add + mov rdi, r12 + add r12, QWORD PTR [r8] + mov rsi, r13 + adc r13, QWORD PTR [r8+8] + mov rbx, r14 + adc r14, QWORD PTR [r8+16] + mov rbp, r15 + adc r15, QWORD PTR [r8+24] + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [r8] + sbb rsi, QWORD PTR [r8+8] + sbb rbx, QWORD PTR [r8+16] + sbb rbp, QWORD PTR [r8+24] + sbb r11, r11 + shld r11, rbp, 1 + imul r11, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, r11 + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [r8], r12 + mov QWORD PTR [r8+8], r13 + mov QWORD PTR [r8+16], r14 + mov QWORD PTR [r8+24], r15 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+8], rsi + mov QWORD PTR [rcx+16], rbx + mov QWORD PTR [rcx+24], rbp + sub r10, 32 + ; Double + mov r12, QWORD PTR [r10] + mov r13, QWORD PTR [r10+8] + add r12, r12 + mov r14, QWORD PTR [r10+16] + adc r13, r13 + mov r15, QWORD PTR [r10+24] + adc r14, r14 + adc r15, r15 + 
mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + mov r8, rcx + add r8, 96 + add rcx, 64 + ; Add-Sub + ; Add + mov rdi, r12 + add r12, QWORD PTR [r8] + mov rsi, r13 + adc r13, QWORD PTR [r8+8] + mov rbx, r14 + adc r14, QWORD PTR [r8+16] + mov rbp, r15 + adc r15, QWORD PTR [r8+24] + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [r8] + sbb rsi, QWORD PTR [r8+8] + sbb rbx, QWORD PTR [r8+16] + sbb rbp, QWORD PTR [r8+24] + sbb r11, r11 + shld r11, rbp, 1 + imul r11, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, r11 + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [r8], rdi + mov QWORD PTR [r8+8], rsi + mov QWORD PTR [r8+16], rbx + mov QWORD PTR [r8+24], rbp + add rsp, 24 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +ge_madd_x64 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_msub_x64 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov r9, r8 + mov r8, rdx + sub rsp, 24 + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], r8 + mov QWORD PTR [rsp+16], r9 + mov r10, r8 + mov r9, r8 + add r9, 32 + mov r8, rcx + add r8, 32 + ; Add-Sub + ; Add + mov r12, QWORD PTR [r9] + mov r13, QWORD PTR [r9+8] + mov r14, QWORD PTR [r9+16] + mov r15, QWORD PTR [r9+24] + mov rdi, r12 + add r12, QWORD PTR [r10] + mov rsi, r13 + adc r13, QWORD PTR [r10+8] + mov rbx, r14 + adc r14, QWORD PTR [r10+16] + mov rbp, r15 + adc r15, QWORD PTR [r10+24] + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR 
[r10] + sbb rsi, QWORD PTR [r10+8] + sbb rbx, QWORD PTR [r10+16] + sbb rbp, QWORD PTR [r10+24] + sbb r11, r11 + shld r11, rbp, 1 + imul r11, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, r11 + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [r8], rdi + mov QWORD PTR [r8+8], rsi + mov QWORD PTR [r8+16], rbx + mov QWORD PTR [r8+24], rbp + mov r9, QWORD PTR [rsp+16] + add rcx, 32 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx] + mov r12, rax + mov r13, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx] + add r14, rax + adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+8] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+8] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+24] + add rdi, rax + adc rsi, rdx + adc rbx, 
0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+24] + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + mov rax, 38 + adc rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + add r10, 96 + add r9, 64 + add rcx, 64 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10] + mov r12, rax + mov r13, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10] + add r14, rax + adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+8] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + 
mul QWORD PTR [r10+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+8] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+24] + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + mov rax, 38 + adc rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + sub r9, 32 + sub rcx, 96 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx] + mov r12, rax + mov r13, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx] + add r14, rax + adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+16] + add r14, rax + adc r15, rdx + adc 
rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+8] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+8] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+24] + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + mov rax, 38 + adc rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + ; Add-Sub + ; Add + mov rdi, r12 + add r12, QWORD PTR [r8] + mov rsi, r13 + adc r13, QWORD PTR [r8+8] + mov rbx, r14 + adc r14, QWORD PTR [r8+16] + mov rbp, r15 + adc r15, QWORD PTR [r8+24] + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [r8] + sbb rsi, QWORD PTR [r8+8] 
+ sbb rbx, QWORD PTR [r8+16] + sbb rbp, QWORD PTR [r8+24] + sbb r11, r11 + shld r11, rbp, 1 + imul r11, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, r11 + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [r8], r12 + mov QWORD PTR [r8+8], r13 + mov QWORD PTR [r8+16], r14 + mov QWORD PTR [r8+24], r15 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+8], rsi + mov QWORD PTR [rcx+16], rbx + mov QWORD PTR [rcx+24], rbp + sub r10, 32 + add rcx, 64 + ; Double + mov r12, QWORD PTR [r10] + mov r13, QWORD PTR [r10+8] + add r12, r12 + mov r14, QWORD PTR [r10+16] + adc r13, r13 + mov r15, QWORD PTR [r10+24] + adc r14, r14 + adc r15, r15 + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + mov r8, rcx + add r8, 32 + ; Add-Sub + ; Add + mov rdi, r12 + add r12, QWORD PTR [r8] + mov rsi, r13 + adc r13, QWORD PTR [r8+8] + mov rbx, r14 + adc r14, QWORD PTR [r8+16] + mov rbp, r15 + adc r15, QWORD PTR [r8+24] + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [r8] + sbb rsi, QWORD PTR [r8+8] + sbb rbx, QWORD PTR [r8+16] + sbb rbp, QWORD PTR [r8+24] + sbb r11, r11 + shld r11, rbp, 1 + imul r11, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, r11 + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [r8], r12 + mov QWORD PTR [r8+8], r13 + mov QWORD PTR [r8+16], r14 + mov QWORD PTR [r8+24], r15 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+8], rsi + mov QWORD PTR [rcx+16], rbx + mov QWORD PTR [rcx+24], rbp + add rsp, 24 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +ge_msub_x64 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_add_x64 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov r9, r8 + mov r8, rdx + sub rsp, 24 + mov QWORD 
PTR [rsp], rcx + mov QWORD PTR [rsp+8], r8 + mov QWORD PTR [rsp+16], r9 + mov r10, r8 + mov r9, r8 + add r9, 32 + mov r8, rcx + add r8, 32 + ; Add-Sub + ; Add + mov r12, QWORD PTR [r9] + mov r13, QWORD PTR [r9+8] + mov r14, QWORD PTR [r9+16] + mov r15, QWORD PTR [r9+24] + mov rdi, r12 + add r12, QWORD PTR [r10] + mov rsi, r13 + adc r13, QWORD PTR [r10+8] + mov rbx, r14 + adc r14, QWORD PTR [r10+16] + mov rbp, r15 + adc r15, QWORD PTR [r10+24] + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [r10] + sbb rsi, QWORD PTR [r10+8] + sbb rbx, QWORD PTR [r10+16] + sbb rbp, QWORD PTR [r10+24] + sbb r11, r11 + shld r11, rbp, 1 + imul r11, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, r11 + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [r8], rdi + mov QWORD PTR [r8+8], rsi + mov QWORD PTR [r8+16], rbx + mov QWORD PTR [r8+24], rbp + mov r9, QWORD PTR [rsp+16] + add r9, 32 + add rcx, 32 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx] + mov r12, rax + mov r13, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx] + add r14, rax + adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR 
[r9+16] + mul QWORD PTR [rcx+8] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+8] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+24] + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + mov rax, 38 + adc rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + add r10, 96 + add r9, 64 + add rcx, 64 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10] + mov r12, rax + mov r13, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10] + add r14, rax 
+ adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+8] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+8] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+24] + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + mov rax, 38 + adc rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + sub r9, 96 + sub rcx, 96 + ; Multiply + ; A[0] * 
B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx] + mov r12, rax + mov r13, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx] + add r14, rax + adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+8] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+8] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+24] + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add 
r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + mov rax, 38 + adc rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + ; Add-Sub + ; Add + mov rdi, r12 + add r12, QWORD PTR [r8] + mov rsi, r13 + adc r13, QWORD PTR [r8+8] + mov rbx, r14 + adc r14, QWORD PTR [r8+16] + mov rbp, r15 + adc r15, QWORD PTR [r8+24] + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [r8] + sbb rsi, QWORD PTR [r8+8] + sbb rbx, QWORD PTR [r8+16] + sbb rbp, QWORD PTR [r8+24] + sbb r11, r11 + shld r11, rbp, 1 + imul r11, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, r11 + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [r8], r12 + mov QWORD PTR [r8+8], r13 + mov QWORD PTR [r8+16], r14 + mov QWORD PTR [r8+24], r15 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+8], rsi + mov QWORD PTR [rcx+16], rbx + mov QWORD PTR [rcx+24], rbp + sub r10, 32 + add r9, 64 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10] + mov r12, rax + mov r13, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10] + add r14, rax + adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+8] + add r15, rax + adc 
rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+8] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+24] + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + mov rax, 38 + adc rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + add rcx, 64 + ; Double + add r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + mov r8, rcx + add r8, 32 + ; Add-Sub + ; Add + mov rdi, r12 + add r12, QWORD PTR [r8] + mov rsi, r13 + adc r13, QWORD PTR [r8+8] + mov rbx, r14 + adc r14, QWORD PTR [r8+16] + mov rbp, r15 + adc r15, QWORD PTR [r8+24] + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub 
rdi, QWORD PTR [r8] + sbb rsi, QWORD PTR [r8+8] + sbb rbx, QWORD PTR [r8+16] + sbb rbp, QWORD PTR [r8+24] + sbb r11, r11 + shld r11, rbp, 1 + imul r11, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, r11 + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [r8], rdi + mov QWORD PTR [r8+8], rsi + mov QWORD PTR [r8+16], rbx + mov QWORD PTR [r8+24], rbp + add rsp, 24 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +ge_add_x64 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_sub_x64 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov r9, r8 + mov r8, rdx + sub rsp, 24 + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], r8 + mov QWORD PTR [rsp+16], r9 + mov r10, r8 + mov r9, r8 + add r9, 32 + mov r8, rcx + add r8, 32 + ; Add-Sub + ; Add + mov r12, QWORD PTR [r9] + mov r13, QWORD PTR [r9+8] + mov r14, QWORD PTR [r9+16] + mov r15, QWORD PTR [r9+24] + mov rdi, r12 + add r12, QWORD PTR [r10] + mov rsi, r13 + adc r13, QWORD PTR [r10+8] + mov rbx, r14 + adc r14, QWORD PTR [r10+16] + mov rbp, r15 + adc r15, QWORD PTR [r10+24] + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [r10] + sbb rsi, QWORD PTR [r10+8] + sbb rbx, QWORD PTR [r10+16] + sbb rbp, QWORD PTR [r10+24] + sbb r11, r11 + shld r11, rbp, 1 + imul r11, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, r11 + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [r8], rdi + mov QWORD PTR [r8+8], rsi + mov QWORD PTR [r8+16], rbx + mov QWORD PTR [r8+24], rbp + mov r9, QWORD PTR [rsp+16] + add rcx, 32 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR 
[r9] + mul QWORD PTR [rcx] + mov r12, rax + mov r13, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx] + add r14, rax + adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+8] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+8] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+24] + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + 
adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + mov rax, 38 + adc rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + add r10, 96 + add r9, 96 + add rcx, 64 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10] + mov r12, rax + mov r13, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10] + add r14, rax + adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+8] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+8] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov 
rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+24] + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + mov rax, 38 + adc rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + sub r9, 64 + sub rcx, 96 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx] + mov r12, rax + mov r13, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx] + add r14, rax + adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+8] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [rcx+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+8] + xor rbx, rbx + add rdi, rax + 
adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [rcx+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [rcx+24] + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [rcx+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax + mov rax, 38 + adc rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + ; Add-Sub + ; Add + mov rdi, r12 + add r12, QWORD PTR [r8] + mov rsi, r13 + adc r13, QWORD PTR [r8+8] + mov rbx, r14 + adc r14, QWORD PTR [r8+16] + mov rbp, r15 + adc r15, QWORD PTR [r8+24] + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [r8] + sbb rsi, QWORD PTR [r8+8] + sbb rbx, QWORD PTR [r8+16] + sbb rbp, QWORD PTR [r8+24] + sbb r11, r11 + shld r11, rbp, 1 + imul r11, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, r11 + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [r8], r12 + mov QWORD PTR [r8+8], r13 + mov QWORD PTR [r8+16], r14 + mov QWORD PTR [r8+24], r15 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+8], rsi + mov QWORD PTR [rcx+16], rbx + mov QWORD PTR [rcx+24], rbp + sub r10, 32 + add r9, 32 + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10] + mov r12, rax + mov r13, 
rdx + ; A[0] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10] + xor r14, r14 + add r13, rax + adc r14, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10] + add r14, rax + adc r15, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+8] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+16] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[0] + mov rax, QWORD PTR [r9] + mul QWORD PTR [r10+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+8] + xor rbx, rbx + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+16] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [r9+8] + mul QWORD PTR [r10+24] + add rdi, rax + adc rsi, rdx + adc rbx, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+16] + xor rbp, rbp + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [r9+16] + mul QWORD PTR [r10+24] + add rsi, rax + adc rbx, rdx + adc rbp, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [r9+24] + mul QWORD PTR [r10+24] + add rbx, rax + adc rbp, rdx + mov rax, 38 + mul rbp + add r15, rax + adc rdx, 0 + mov r11, 9223372036854775807 + shld rdx, r15, 1 + imul rdx, rdx, 19 + and r15, r11 + mov r11, rdx + mov rax, 38 + mul rdi + xor rdi, rdi + add r12, rax + mov rax, 38 + adc rdi, rdx + mul rsi + xor rsi, rsi + add r13, rax 
+ mov rax, 38 + adc rsi, rdx + mul rbx + xor rbx, rbx + add r14, rax + adc rbx, rdx + add r12, r11 + adc r13, rdi + adc r14, rsi + adc r15, rbx + ; Store + ; Double + add r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + mov r8, rcx + add r8, 64 + add rcx, 96 + ; Add-Sub + ; Add + mov rdi, r12 + add r12, QWORD PTR [rcx] + mov rsi, r13 + adc r13, QWORD PTR [rcx+8] + mov rbx, r14 + adc r14, QWORD PTR [rcx+16] + mov rbp, r15 + adc r15, QWORD PTR [rcx+24] + mov r11, 0 + adc r11, 0 + shld r11, r15, 1 + imul r11, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [rcx] + sbb rsi, QWORD PTR [rcx+8] + sbb rbx, QWORD PTR [rcx+16] + sbb rbp, QWORD PTR [rcx+24] + sbb r11, r11 + shld r11, rbp, 1 + imul r11, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, r11 + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [r8], rdi + mov QWORD PTR [r8+8], rsi + mov QWORD PTR [r8+16], rbx + mov QWORD PTR [r8+24], rbp + add rsp, 24 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +ge_sub_x64 ENDP +_TEXT ENDS +IFDEF HAVE_ED25519 +_TEXT SEGMENT READONLY PARA +fe_sq2_x64 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + mov r8, rdx + ; Square * 2 + ; A[0] * A[1] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+8] + mov r10, rax + mov r11, rdx + ; A[0] * A[2] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+16] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[0] * A[3] + mov rax, QWORD PTR [r8] + mul QWORD PTR [r8+24] + xor r13, r13 + add r12, rax + adc r13, rdx + ; A[1] * A[2] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [r8+16] + xor r14, r14 + add r12, rax + adc r13, rdx 
+ adc r14, 0 + ; A[1] * A[3] + mov rax, QWORD PTR [r8+8] + mul QWORD PTR [r8+24] + add r13, rax + adc r14, rdx + ; A[2] * A[3] + mov rax, QWORD PTR [r8+16] + mul QWORD PTR [r8+24] + xor r15, r15 + add r14, rax + adc r15, rdx + ; Double + xor rdi, rdi + add r10, r10 + adc r11, r11 + adc r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + adc rdi, 0 + ; A[0] * A[0] + mov rax, QWORD PTR [r8] + mul rax + mov r9, rax + mov rsi, rdx + ; A[1] * A[1] + mov rax, QWORD PTR [r8+8] + mul rax + add r10, rsi + adc r11, rax + adc rdx, 0 + mov rsi, rdx + ; A[2] * A[2] + mov rax, QWORD PTR [r8+16] + mul rax + add r12, rsi + adc r13, rax + adc rdx, 0 + mov rsi, rdx + ; A[3] * A[3] + mov rax, QWORD PTR [r8+24] + mul rax + add r15, rax + adc rdi, rdx + add r14, rsi + adc r15, 0 + adc rdi, 0 + mov rax, 38 + mul rdi + add r12, rax + adc rdx, 0 + mov rsi, 9223372036854775807 + shld rdx, r12, 1 + imul rdx, rdx, 19 + and r12, rsi + mov rsi, rdx + mov rax, 38 + mul r13 + xor r13, r13 + add r9, rax + mov rax, 38 + adc r13, rdx + mul r14 + xor r14, r14 + add r10, rax + mov rax, 38 + adc r14, rdx + mul r15 + xor r15, r15 + add r11, rax + adc r15, rdx + add r9, rsi + adc r10, r13 + adc r11, r14 + adc r12, r15 + mov rax, r12 + shld r12, r11, 1 + shld r11, r10, 1 + shld r10, r9, 1 + shl r9, 1 + mov rsi, 9223372036854775807 + shr rax, 62 + and r12, rsi + imul rax, rax, 19 + add r9, rax + adc r10, 0 + adc r11, 0 + adc r12, 0 + ; Store + mov QWORD PTR [rcx], r9 + mov QWORD PTR [rcx+8], r10 + mov QWORD PTR [rcx+16], r11 + mov QWORD PTR [rcx+24], r12 + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +fe_sq2_x64 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +sc_reduce_x64 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov r9, QWORD PTR [rcx] + mov r10, QWORD PTR [rcx+8] + mov r11, QWORD PTR [rcx+16] + mov r12, QWORD PTR [rcx+24] + mov r13, QWORD PTR [rcx+32] + mov r14, QWORD PTR [rcx+40] + mov r15, QWORD PTR [rcx+48] + mov rdi, QWORD 
PTR [rcx+56] + mov r8, rdi + mov rsi, 1152921504606846975 + shr r8, 56 + shld rdi, r15, 4 + shld r15, r14, 4 + shld r14, r13, 4 + shld r13, r12, 4 + and r12, rsi + and rdi, rsi + ; Add order times bits 504..511 + sub r15, r8 + sbb rdi, 0 + mov rax, 16942830013509034793 + mul r8 + mov rsi, 0 + add r14, rax + mov rax, 12100500283911187475 + adc rsi, rdx + mul r8 + add r13, rax + adc r14, rdx + adc r15, rsi + adc rdi, 0 + ; Sub product of top 4 words and order + mov r8, 12100500283911187475 + mov rax, r13 + mul r8 + mov rbp, 0 + add r9, rax + adc rbp, rdx + mov rax, r14 + mul r8 + mov rsi, 0 + add r10, rax + adc rsi, rdx + mov rax, r15 + mul r8 + add r10, rbp + adc r11, rax + adc r12, rdx + mov rbx, 0 + adc rbx, 0 + mov rax, rdi + mul r8 + add r11, rsi + adc r12, rax + adc rbx, rdx + mov r8, 16942830013509034793 + mov rax, r13 + mul r8 + mov rbp, 0 + add r10, rax + adc rbp, rdx + mov rax, r14 + mul r8 + mov rsi, 0 + add r11, rax + adc rsi, rdx + mov rax, r15 + mul r8 + add r11, rbp + adc r12, rax + adc rbx, rdx + mov rbp, 0 + adc rbp, 0 + mov rax, rdi + mul r8 + add r12, rsi + adc rbx, rax + adc rbp, rdx + sub r11, r13 + mov r13, rbx + sbb r12, r14 + mov r14, rbp + sbb r13, r15 + sbb r14, rdi + mov r8, r14 + sar r8, 57 + ; Conditionally subtract order starting at bit 125 + mov rax, 11529215046068469760 + mov rdx, 14628338529006959229 + mov rbx, 187989257525064602 + mov rbp, 144115188075855872 + and rax, r8 + and rdx, r8 + and rbx, r8 + and rbp, r8 + add r10, rax + adc r11, rdx + adc r12, rbx + adc r13, 0 + adc r14, rbp + ; Move bits 252-376 to own registers + mov r8, 1152921504606846975 + shld r14, r13, 4 + shld r13, r12, 4 + and r12, r8 + ; Sub product of top 2 words and order + ; * -5812631a5cf5d3ed + mov r8, 12100500283911187475 + mov rax, r13 + mul r8 + mov rbx, 0 + add r9, rax + adc r10, rdx + adc rbx, 0 + mov rax, r14 + mul r8 + add r10, rax + adc rbx, rdx + ; * -14def9dea2f79cd7 + mov r8, 16942830013509034793 + mov rax, r13 + mul r8 + mov rbp, 0 + add r10, rax 
+ adc r11, rdx + adc rbp, 0 + mov rax, r14 + mul r8 + add r11, rax + adc rbp, rdx + ; Add overflows at 2 * 64 + mov rsi, 1152921504606846975 + and r12, rsi + add r11, rbx + adc r12, rbp + ; Subtract top at 2 * 64 + sub r11, r13 + sbb r12, r14 + sbb rsi, rsi + ; Conditional sub order + mov rax, 6346243789798364141 + mov rdx, 1503914060200516822 + mov rbx, 1152921504606846976 + and rax, rsi + and rdx, rsi + and rbx, rsi + add r9, rax + mov rax, 1152921504606846975 + adc r10, rdx + adc r11, 0 + adc r12, rbx + and r12, rax + ; Store result + mov QWORD PTR [rcx], r9 + mov QWORD PTR [rcx+8], r10 + mov QWORD PTR [rcx+16], r11 + mov QWORD PTR [rcx+24], r12 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sc_reduce_x64 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +sc_muladd_x64 PROC + push rbp + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + mov rbp, r8 + mov r8, rdx + ; Multiply + ; A[0] * B[0] + mov rax, QWORD PTR [rbp] + mul QWORD PTR [r8] + mov r10, rax + mov r11, rdx + ; A[0] * B[1] + mov rax, QWORD PTR [rbp+8] + mul QWORD PTR [r8] + xor r12, r12 + add r11, rax + adc r12, rdx + ; A[1] * B[0] + mov rax, QWORD PTR [rbp] + mul QWORD PTR [r8+8] + xor r13, r13 + add r11, rax + adc r12, rdx + adc r13, 0 + ; A[0] * B[2] + mov rax, QWORD PTR [rbp+16] + mul QWORD PTR [r8] + add r12, rax + adc r13, rdx + ; A[1] * B[1] + mov rax, QWORD PTR [rbp+8] + mul QWORD PTR [r8+8] + xor r14, r14 + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[2] * B[0] + mov rax, QWORD PTR [rbp] + mul QWORD PTR [r8+16] + add r12, rax + adc r13, rdx + adc r14, 0 + ; A[0] * B[3] + mov rax, QWORD PTR [rbp+24] + mul QWORD PTR [r8] + xor r15, r15 + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[2] + mov rax, QWORD PTR [rbp+16] + mul QWORD PTR [r8+8] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[2] * B[1] + mov rax, QWORD PTR [rbp+8] + mul QWORD PTR [r8+16] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[3] * B[0] + mov rax, QWORD 
PTR [rbp] + mul QWORD PTR [r8+24] + add r13, rax + adc r14, rdx + adc r15, 0 + ; A[1] * B[3] + mov rax, QWORD PTR [rbp+24] + mul QWORD PTR [r8+8] + xor rdi, rdi + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[2] + mov rax, QWORD PTR [rbp+16] + mul QWORD PTR [r8+16] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[3] * B[1] + mov rax, QWORD PTR [rbp+8] + mul QWORD PTR [r8+24] + add r14, rax + adc r15, rdx + adc rdi, 0 + ; A[2] * B[3] + mov rax, QWORD PTR [rbp+24] + mul QWORD PTR [r8+16] + xor rsi, rsi + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[2] + mov rax, QWORD PTR [rbp+16] + mul QWORD PTR [r8+24] + add r15, rax + adc rdi, rdx + adc rsi, 0 + ; A[3] * B[3] + mov rax, QWORD PTR [rbp+24] + mul QWORD PTR [r8+24] + add rdi, rax + adc rsi, rdx + ; Add c to a * b + add r10, QWORD PTR [r9] + adc r11, QWORD PTR [r9+8] + adc r12, QWORD PTR [r9+16] + adc r13, QWORD PTR [r9+24] + adc r14, 0 + adc r15, 0 + adc rdi, 0 + adc rsi, 0 + mov rbx, rsi + mov r9, 1152921504606846975 + shr rbx, 56 + shld rsi, rdi, 4 + shld rdi, r15, 4 + shld r15, r14, 4 + shld r14, r13, 4 + and r13, r9 + and rsi, r9 + ; Add order times bits 504..507 + sub rdi, rbx + sbb rsi, 0 + mov rax, 16942830013509034793 + mul rbx + mov r9, 0 + add r15, rax + mov rax, 12100500283911187475 + adc r9, rdx + mul rbx + add r14, rax + adc r15, rdx + adc rdi, r9 + adc rsi, 0 + ; Sub product of top 4 words and order + mov rbx, 12100500283911187475 + mov rax, r14 + mul rbx + mov rbp, 0 + add r10, rax + adc rbp, rdx + mov rax, r15 + mul rbx + mov r9, 0 + add r11, rax + adc r9, rdx + mov rax, rdi + mul rbx + add r11, rbp + adc r12, rax + adc r13, rdx + mov r8, 0 + adc r8, 0 + mov rax, rsi + mul rbx + add r12, r9 + adc r13, rax + adc r8, rdx + mov rbx, 16942830013509034793 + mov rax, r14 + mul rbx + mov rbp, 0 + add r11, rax + adc rbp, rdx + mov rax, r15 + mul rbx + mov r9, 0 + add r12, rax + adc r9, rdx + mov rax, rdi + mul rbx + add r12, rbp + adc r13, rax + adc r8, rdx + mov rbp, 0 + adc rbp, 0 + mov 
rax, rsi + mul rbx + add r13, r9 + adc r8, rax + adc rbp, rdx + sub r12, r14 + mov r14, r8 + sbb r13, r15 + mov r15, rbp + sbb r14, rdi + sbb r15, rsi + mov rbx, r15 + sar rbx, 57 + ; Conditionally subtract order starting at bit 125 + mov rax, 11529215046068469760 + mov rdx, 14628338529006959229 + mov r8, 187989257525064602 + mov rbp, 144115188075855872 + and rax, rbx + and rdx, rbx + and r8, rbx + and rbp, rbx + add r11, rax + adc r12, rdx + adc r13, r8 + adc r14, 0 + adc r15, rbp + ; Move bits 252-376 to own registers + mov rbx, 1152921504606846975 + shld r15, r14, 4 + shld r14, r13, 4 + and r13, rbx + ; Sub product of top 2 words and order + ; * -5812631a5cf5d3ed + mov rbx, 12100500283911187475 + mov rax, r14 + mul rbx + mov r8, 0 + add r10, rax + adc r11, rdx + adc r8, 0 + mov rax, r15 + mul rbx + add r11, rax + adc r8, rdx + ; * -14def9dea2f79cd7 + mov rbx, 16942830013509034793 + mov rax, r14 + mul rbx + mov rbp, 0 + add r11, rax + adc r12, rdx + adc rbp, 0 + mov rax, r15 + mul rbx + add r12, rax + adc rbp, rdx + ; Add overflows at 2 * 64 + mov r9, 1152921504606846975 + and r13, r9 + add r12, r8 + adc r13, rbp + ; Subtract top at 2 * 64 + sub r12, r14 + sbb r13, r15 + sbb r9, r9 + ; Conditional sub order + mov rax, 6346243789798364141 + mov rdx, 1503914060200516822 + mov r8, 1152921504606846976 + and rax, r9 + and rdx, r9 + and r8, r9 + add r10, rax + mov rax, 1152921504606846975 + adc r11, rdx + adc r12, 0 + adc r13, r8 + and r13, rax + ; Store result + mov QWORD PTR [rcx], r10 + mov QWORD PTR [rcx+8], r11 + mov QWORD PTR [rcx+16], r12 + mov QWORD PTR [rcx+24], r13 + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + ret +sc_muladd_x64 ENDP +_TEXT ENDS +; /* Non-constant time modular inversion. +; * +; * @param [out] r Resulting number. +; * @param [in] a Number to invert. +; * @return MP_OKAY on success. 
+; */ +_TEXT SEGMENT READONLY PARA +fe_invert_nct_x64 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + sub rsp, 513 + mov r9, -19 + mov r10, -1 + mov r11, -1 + mov r12, 9223372036854775807 + mov r13, QWORD PTR [rdx] + mov r14, QWORD PTR [rdx+8] + mov r15, QWORD PTR [rdx+16] + mov rdi, QWORD PTR [rdx+24] + mov rsi, 0 + test r13b, 1 + jnz fe_invert_nct_v_even_end +fe_invert_nct_v_even_start: + shrd r13, r14, 1 + shrd r14, r15, 1 + shrd r15, rdi, 1 + shr rdi, 1 + mov BYTE PTR [rsp+rsi], 1 + inc rsi + test r13b, 1 + jz fe_invert_nct_v_even_start +fe_invert_nct_v_even_end: +L_fe_invert_nct_uv_start: + cmp r12, rdi + jb L_fe_invert_nct_uv_v + ja L_fe_invert_nct_uv_u + cmp r11, r15 + jb L_fe_invert_nct_uv_v + ja L_fe_invert_nct_uv_u + cmp r10, r14 + jb L_fe_invert_nct_uv_v + ja L_fe_invert_nct_uv_u + cmp r9, r13 + jb L_fe_invert_nct_uv_v +L_fe_invert_nct_uv_u: + mov BYTE PTR [rsp+rsi], 2 + inc rsi + sub r9, r13 + sbb r10, r14 + sbb r11, r15 + sbb r12, rdi + shrd r9, r10, 1 + shrd r10, r11, 1 + shrd r11, r12, 1 + shr r12, 1 + test r9b, 1 + jnz fe_invert_nct_usubv_even_end +fe_invert_nct_usubv_even_start: + shrd r9, r10, 1 + shrd r10, r11, 1 + shrd r11, r12, 1 + shr r12, 1 + mov BYTE PTR [rsp+rsi], 0 + inc rsi + test r9b, 1 + jz fe_invert_nct_usubv_even_start +fe_invert_nct_usubv_even_end: + cmp r9, 1 + jne L_fe_invert_nct_uv_start + mov rax, r10 + or rax, r11 + jne L_fe_invert_nct_uv_start + or rax, r12 + jne L_fe_invert_nct_uv_start + mov r8b, 1 + jmp L_fe_invert_nct_uv_end +L_fe_invert_nct_uv_v: + mov BYTE PTR [rsp+rsi], 3 + inc rsi + sub r13, r9 + sbb r14, r10 + sbb r15, r11 + sbb rdi, r12 + shrd r13, r14, 1 + shrd r14, r15, 1 + shrd r15, rdi, 1 + shr rdi, 1 + test r13b, 1 + jnz fe_invert_nct_vsubu_even_end +fe_invert_nct_vsubu_even_start: + shrd r13, r14, 1 + shrd r14, r15, 1 + shrd r15, rdi, 1 + shr rdi, 1 + mov BYTE PTR [rsp+rsi], 1 + inc rsi + test r13b, 1 + jz fe_invert_nct_vsubu_even_start +fe_invert_nct_vsubu_even_end: + cmp r13, 1 + jne 
L_fe_invert_nct_uv_start + mov rax, r14 + or rax, r15 + jne L_fe_invert_nct_uv_start + or rax, rdi + jne L_fe_invert_nct_uv_start + mov r8b, 0 +L_fe_invert_nct_uv_end: + mov r9, -19 + mov r10, -1 + mov r11, -1 + mov r12, 9223372036854775807 + mov r13, 1 + xor r14, r14 + xor r15, r15 + xor rdi, rdi + mov BYTE PTR [rsp+rsi], 7 + mov al, BYTE PTR [rsp] + mov rsi, 1 + cmp al, 1 + je L_fe_invert_nct_op_div2_d + jl L_fe_invert_nct_op_div2_b + cmp al, 3 + je L_fe_invert_nct_op_d_sub_b + jl L_fe_invert_nct_op_b_sub_d + jmp L_fe_invert_nct_op_end +L_fe_invert_nct_op_b_sub_d: + sub r9, r13 + sbb r10, r14 + sbb r11, r15 + sbb r12, rdi + jnc L_fe_invert_nct_op_div2_b + mov rax, -1 + add r9, -19 + adc r10, rax + adc r11, rax + mov rax, 9223372036854775807 + adc r12, rax +L_fe_invert_nct_op_div2_b: + test r9b, 1 + jz L_fe_invert_nct_op_div2_b_mod + add r9, -19 + mov rax, -1 + adc r10, rax + adc r11, rax + mov rax, 9223372036854775807 + adc r12, rax +L_fe_invert_nct_op_div2_b_mod: + shrd r9, r10, 1 + shrd r10, r11, 1 + shrd r11, r12, 1 + shr r12, 1 + mov al, BYTE PTR [rsp+rsi] + inc rsi + cmp al, 1 + je L_fe_invert_nct_op_div2_d + jl L_fe_invert_nct_op_div2_b + cmp al, 3 + je L_fe_invert_nct_op_d_sub_b + jl L_fe_invert_nct_op_b_sub_d + jmp L_fe_invert_nct_op_end +L_fe_invert_nct_op_d_sub_b: + sub r13, r9 + sbb r14, r10 + sbb r15, r11 + sbb rdi, r12 + jnc L_fe_invert_nct_op_div2_d + mov rax, -1 + add r13, -19 + adc r14, rax + adc r15, rax + mov rax, 9223372036854775807 + adc rdi, rax +L_fe_invert_nct_op_div2_d: + test r13b, 1 + jz L_fe_invert_nct_op_div2_d_mod + add r13, -19 + mov rax, -1 + adc r14, rax + adc r15, rax + mov rax, 9223372036854775807 + adc rdi, rax +L_fe_invert_nct_op_div2_d_mod: + shrd r13, r14, 1 + shrd r14, r15, 1 + shrd r15, rdi, 1 + shr rdi, 1 + mov al, BYTE PTR [rsp+rsi] + inc rsi + cmp al, 1 + je L_fe_invert_nct_op_div2_d + jl L_fe_invert_nct_op_div2_b + cmp al, 3 + je L_fe_invert_nct_op_d_sub_b + jl L_fe_invert_nct_op_b_sub_d +L_fe_invert_nct_op_end: + cmp 
r8b, 1 + jne L_fe_invert_nct_store_d + mov QWORD PTR [rcx], r9 + mov QWORD PTR [rcx+8], r10 + mov QWORD PTR [rcx+16], r11 + mov QWORD PTR [rcx+24], r12 + jmp L_fe_invert_nct_store_end +L_fe_invert_nct_store_d: + mov QWORD PTR [rcx], r13 + mov QWORD PTR [rcx+8], r14 + mov QWORD PTR [rcx+16], r15 + mov QWORD PTR [rcx+24], rdi +L_fe_invert_nct_store_end: + add rsp, 513 + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +fe_invert_nct_x64 ENDP +_TEXT ENDS +ENDIF +IFDEF HAVE_INTEL_AVX2 +_TEXT SEGMENT READONLY PARA +fe_cmov_table_avx2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + mov r9, rdx + sub rsp, 64 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + xor rbx, rbx + movsx rax, r8b + cdq + xor al, dl + sub al, dl + mov bl, al + movd xmm7, ebx + mov rbx, 1 + movd xmm9, rbx + vmovdqa ymm3, ymm9 + vmovdqa ymm4, ymm9 + vpxor ymm8, ymm8, ymm8 + vpermd ymm7, ymm8, ymm7 + vpermd ymm9, ymm8, ymm9 + vpxor ymm0, ymm0, ymm0 + vpxor ymm1, ymm1, ymm1 + vpxor ymm2, ymm2, ymm2 + vpcmpeqd ymm6, ymm8, ymm7 + vpxor ymm5, ymm5, ymm5 + vpand ymm3, ymm3, ymm6 + vpand ymm4, ymm4, ymm6 + vmovdqa ymm8, ymm9 + vpcmpeqd ymm6, ymm8, ymm7 + vpaddd ymm8, ymm8, ymm9 + vmovupd ymm0, YMMWORD PTR [r9] + vmovupd ymm1, YMMWORD PTR [r9+32] + vmovupd ymm2, YMMWORD PTR [r9+64] + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpand ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm0 + vpor ymm4, ymm4, ymm1 + vpor ymm5, ymm5, ymm2 + vpcmpeqd ymm6, ymm8, ymm7 + vpaddd ymm8, ymm8, ymm9 + vmovupd ymm0, YMMWORD PTR [r9+96] + vmovupd ymm1, YMMWORD PTR [r9+128] + vmovupd ymm2, YMMWORD PTR [r9+160] + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpand ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm0 + vpor ymm4, ymm4, ymm1 + vpor ymm5, ymm5, ymm2 + vpcmpeqd ymm6, ymm8, ymm7 + vpaddd ymm8, ymm8, ymm9 + vmovupd ymm0, YMMWORD PTR [r9+192] + vmovupd ymm1, YMMWORD PTR [r9+224] + vmovupd ymm2, 
YMMWORD PTR [r9+256] + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpand ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm0 + vpor ymm4, ymm4, ymm1 + vpor ymm5, ymm5, ymm2 + vpcmpeqd ymm6, ymm8, ymm7 + vpaddd ymm8, ymm8, ymm9 + vmovupd ymm0, YMMWORD PTR [r9+288] + vmovupd ymm1, YMMWORD PTR [r9+320] + vmovupd ymm2, YMMWORD PTR [r9+352] + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpand ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm0 + vpor ymm4, ymm4, ymm1 + vpor ymm5, ymm5, ymm2 + vpcmpeqd ymm6, ymm8, ymm7 + vpaddd ymm8, ymm8, ymm9 + vmovupd ymm0, YMMWORD PTR [r9+384] + vmovupd ymm1, YMMWORD PTR [r9+416] + vmovupd ymm2, YMMWORD PTR [r9+448] + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpand ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm0 + vpor ymm4, ymm4, ymm1 + vpor ymm5, ymm5, ymm2 + vpcmpeqd ymm6, ymm8, ymm7 + vpaddd ymm8, ymm8, ymm9 + vmovupd ymm0, YMMWORD PTR [r9+480] + vmovupd ymm1, YMMWORD PTR [r9+512] + vmovupd ymm2, YMMWORD PTR [r9+544] + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpand ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm0 + vpor ymm4, ymm4, ymm1 + vpor ymm5, ymm5, ymm2 + vpcmpeqd ymm6, ymm8, ymm7 + vpaddd ymm8, ymm8, ymm9 + vmovupd ymm0, YMMWORD PTR [r9+576] + vmovupd ymm1, YMMWORD PTR [r9+608] + vmovupd ymm2, YMMWORD PTR [r9+640] + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpand ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm0 + vpor ymm4, ymm4, ymm1 + vpor ymm5, ymm5, ymm2 + vpcmpeqd ymm6, ymm8, ymm7 + vpaddd ymm8, ymm8, ymm9 + vmovupd ymm0, YMMWORD PTR [r9+672] + vmovupd ymm1, YMMWORD PTR [r9+704] + vmovupd ymm2, YMMWORD PTR [r9+736] + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpand ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm0 + vpor ymm4, ymm4, ymm1 + vpor ymm5, ymm5, ymm2 + movsx rax, r8b + sar rax, 63 + vmovd xmm6, eax + vpxor ymm8, ymm8, ymm8 + vpermd ymm6, ymm8, ymm6 + vpxor ymm8, ymm3, ymm4 + vpand ymm8, ymm8, ymm6 + vpxor ymm3, ymm3, ymm8 + vpxor ymm4, ymm4, ymm8 + vmovupd YMMWORD PTR [rcx], ymm3 + vmovupd YMMWORD PTR [rcx+32], ymm4 
+ vmovupd YMMWORD PTR [rcx+64], ymm5 + mov r10, QWORD PTR [rcx+64] + mov r11, QWORD PTR [rcx+72] + mov r12, QWORD PTR [rcx+80] + mov r13, QWORD PTR [rcx+88] + mov r14, -19 + mov r15, -1 + mov rdi, -1 + mov rsi, 9223372036854775807 + sub r14, r10 + sbb r15, r11 + sbb rdi, r12 + sbb rsi, r13 + cmp r8b, 0 + cmovl r10, r14 + cmovl r11, r15 + cmovl r12, rdi + cmovl r13, rsi + mov QWORD PTR [rcx+64], r10 + mov QWORD PTR [rcx+72], r11 + mov QWORD PTR [rcx+80], r12 + mov QWORD PTR [rcx+88], r13 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + add rsp, 64 + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +fe_cmov_table_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_mul_avx2 PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbp + mov rdi, rcx + mov rsi, rdx + mov rbp, r8 + mov rbx, QWORD PTR [rsi] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rbp] + mulx r9, r8, rbx + ; A[2] * B[0] + mulx r11, r10, QWORD PTR [rsi+16] + ; A[1] * B[0] + mulx rcx, rax, QWORD PTR [rsi+8] + xor r15, r15 + adcx r9, rax + ; A[3] * B[1] + mov rdx, QWORD PTR [rbp+8] + mulx r13, r12, QWORD PTR [rsi+24] + adcx r10, rcx + ; A[0] * B[1] + mulx rcx, rax, rbx + adox r9, rax + ; A[2] * B[1] + mulx r14, rax, QWORD PTR [rsi+16] + adox r10, rcx + adcx r11, rax + ; A[1] * B[2] + mov rdx, QWORD PTR [rbp+16] + mulx rcx, rax, QWORD PTR [rsi+8] + adcx r12, r14 + adox r11, rax + adcx r13, r15 + adox r12, rcx + ; A[0] * B[2] + mulx rcx, rax, rbx + adox r13, r15 + xor r14, r14 + adcx r10, rax + ; A[1] * B[1] + mov rdx, QWORD PTR [rbp+8] + mulx rax, rdx, QWORD PTR [rsi+8] + adcx r11, rcx + adox r10, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rbp+24] + adox r11, rax + mulx rcx, rax, QWORD PTR [rsi+8] + adcx r12, rax + ; A[2] * B[2] + mov rdx, QWORD PTR [rbp+16] + mulx rax, rdx, QWORD PTR [rsi+16] + adcx r13, rcx + adox r12, rdx + ; A[3] * B[3] 
+ mov rdx, QWORD PTR [rbp+24] + adox r13, rax + mulx rcx, rax, QWORD PTR [rsi+24] + adox r14, r15 + adcx r14, rax + ; A[0] * B[3] + mulx rax, rdx, rbx + adcx r15, rcx + xor rcx, rcx + adcx r11, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rsi+24] + adcx r12, rax + mulx rax, rdx, QWORD PTR [rbp] + adox r11, rdx + adox r12, rax + ; A[3] * B[2] + mov rdx, QWORD PTR [rsi+24] + mulx rax, rdx, QWORD PTR [rbp+16] + adcx r13, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rbp+24] + adcx r14, rax + mulx rdx, rax, QWORD PTR [rsi+16] + adcx r15, rcx + adox r13, rax + adox r14, rdx + adox r15, rcx + mov rdx, 38 + mulx rax, r15, r15 + add r11, r15 + adc rax, 0 + mov rcx, 9223372036854775807 + shld rax, r11, 1 + imul rax, rax, 19 + and r11, rcx + xor rcx, rcx + adox r8, rax + mulx r12, rax, r12 + adcx r8, rax + adox r9, r12 + mulx r13, rax, r13 + adcx r9, rax + adox r10, r13 + mulx r14, rax, r14 + adcx r10, rax + adox r11, r14 + adcx r11, rcx + mov rcx, 9223372036854775807 + mov rdx, r11 + sar rdx, 63 + and rdx, 19 + and r11, rcx + add r8, rdx + adc r9, 0 + adc r10, 0 + adc r11, 0 + ; Store + mov QWORD PTR [rdi], r8 + mov QWORD PTR [rdi+8], r9 + mov QWORD PTR [rdi+16], r10 + mov QWORD PTR [rdi+24], r11 + pop rbp + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +fe_mul_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_sq_avx2 PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + ; Square + mov rdx, QWORD PTR [rsi] + mov rax, QWORD PTR [rsi+8] + ; A[0] * A[1] + mov r15, rdx + mulx r10, r9, rax + ; A[0] * A[3] + mulx r12, r11, QWORD PTR [rsi+24] + ; A[2] * A[1] + mov rdx, QWORD PTR [rsi+16] + mulx rbx, rcx, rax + xor r8, r8 + adox r11, rcx + ; A[2] * A[3] + mulx r14, r13, QWORD PTR [rsi+24] + adox r12, rbx + ; A[2] * A[0] + mulx rbx, rcx, r15 + adox r13, r8 + adcx r10, rcx + adox r14, r8 + ; A[1] * A[3] + mov rdx, rax + mulx rdx, rcx, QWORD PTR [rsi+24] + adcx r11, rbx + adcx r12, rcx + adcx r13, 
rdx + adcx r14, r8 + ; A[0] * A[0] + mov rdx, r15 + mulx rcx, r8, rdx + xor r15, r15 + adcx r9, r9 + ; A[1] * A[1] + mov rdx, rax + adox r9, rcx + mulx rbx, rcx, rdx + adcx r10, r10 + adox r10, rcx + adcx r11, r11 + ; A[2] * A[2] + mov rdx, QWORD PTR [rsi+16] + adox r11, rbx + mulx rcx, rbx, rdx + adcx r12, r12 + adox r12, rbx + adcx r13, r13 + ; A[3] * A[3] + mov rdx, QWORD PTR [rsi+24] + adox r13, rcx + mulx rbx, rcx, rdx + adcx r14, r14 + adox r14, rcx + adcx r15, r15 + adox r15, rbx + mov rdx, 38 + mulx rbx, r15, r15 + add r11, r15 + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r11, 1 + imul rbx, rbx, 19 + and r11, rcx + xor rcx, rcx + adox r8, rbx + mulx r12, rbx, r12 + adcx r8, rbx + adox r9, r12 + mulx r13, rbx, r13 + adcx r9, rbx + adox r10, r13 + mulx r14, rbx, r14 + adcx r10, rbx + adox r11, r14 + adcx r11, rcx + mov rcx, 9223372036854775807 + mov rdx, r11 + sar rdx, 63 + and rdx, 19 + and r11, rcx + add r8, rdx + adc r9, 0 + adc r10, 0 + adc r11, 0 + ; Store + mov QWORD PTR [rdi], r8 + mov QWORD PTR [rdi+8], r9 + mov QWORD PTR [rdi+16], r10 + mov QWORD PTR [rdi+24], r11 + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +fe_sq_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_sq_n_avx2 PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbp + mov rdi, rcx + mov rsi, rdx + mov rbp, r8 +L_fe_sq_n_avx2: + ; Square + mov rdx, QWORD PTR [rsi] + mov rax, QWORD PTR [rsi+8] + ; A[0] * A[1] + mov r15, rdx + mulx r10, r9, rax + ; A[0] * A[3] + mulx r12, r11, QWORD PTR [rsi+24] + ; A[2] * A[1] + mov rdx, QWORD PTR [rsi+16] + mulx rbx, rcx, rax + xor r8, r8 + adox r11, rcx + ; A[2] * A[3] + mulx r14, r13, QWORD PTR [rsi+24] + adox r12, rbx + ; A[2] * A[0] + mulx rbx, rcx, r15 + adox r13, r8 + adcx r10, rcx + adox r14, r8 + ; A[1] * A[3] + mov rdx, rax + mulx rdx, rcx, QWORD PTR [rsi+24] + adcx r11, rbx + adcx r12, rcx + adcx r13, rdx + adcx r14, r8 + ; A[0] * A[0] + mov rdx, r15 + mulx rcx, 
r8, rdx + xor r15, r15 + adcx r9, r9 + ; A[1] * A[1] + mov rdx, rax + adox r9, rcx + mulx rbx, rcx, rdx + adcx r10, r10 + adox r10, rcx + adcx r11, r11 + ; A[2] * A[2] + mov rdx, QWORD PTR [rsi+16] + adox r11, rbx + mulx rcx, rbx, rdx + adcx r12, r12 + adox r12, rbx + adcx r13, r13 + ; A[3] * A[3] + mov rdx, QWORD PTR [rsi+24] + adox r13, rcx + mulx rbx, rcx, rdx + adcx r14, r14 + adox r14, rcx + adcx r15, r15 + adox r15, rbx + mov rdx, 38 + mulx rbx, r15, r15 + add r11, r15 + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r11, 1 + imul rbx, rbx, 19 + and r11, rcx + xor rcx, rcx + adox r8, rbx + mulx r12, rbx, r12 + adcx r8, rbx + adox r9, r12 + mulx r13, rbx, r13 + adcx r9, rbx + adox r10, r13 + mulx r14, rbx, r14 + adcx r10, rbx + adox r11, r14 + adcx r11, rcx + ; Store + mov QWORD PTR [rdi], r8 + mov QWORD PTR [rdi+8], r9 + mov QWORD PTR [rdi+16], r10 + mov QWORD PTR [rdi+24], r11 + dec bpl + jnz L_fe_sq_n_avx2 + pop rbp + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +fe_sq_n_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_mul121666_avx2 PROC + push r12 + push r13 + push r14 + push r15 + mov rax, rdx + mov rdx, 121666 + mulx r15, r8, QWORD PTR [rax] + mulx r14, r9, QWORD PTR [rax+8] + mulx r13, r10, QWORD PTR [rax+16] + add r9, r15 + mulx r12, r11, QWORD PTR [rax+24] + adc r10, r14 + adc r11, r13 + adc r12, 0 + shld r12, r11, 1 + btr r11, 63 + imul r12, r12, 19 + add r8, r12 + adc r9, 0 + adc r10, 0 + adc r11, 0 + mov QWORD PTR [rcx], r8 + mov QWORD PTR [rcx+8], r9 + mov QWORD PTR [rcx+16], r10 + mov QWORD PTR [rcx+24], r11 + pop r15 + pop r14 + pop r13 + pop r12 + ret +fe_mul121666_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +fe_invert_avx2 PROC + sub rsp, 144 + ; Invert + mov QWORD PTR [rsp+128], rcx + mov QWORD PTR [rsp+136], rdx + mov rcx, rsp + mov rdx, QWORD PTR [rsp+136] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+32] + mov rdx, rsp + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR 
[rsp+32] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+32] + mov rdx, QWORD PTR [rsp+136] + lea r8, QWORD PTR [rsp+32] + call fe_mul_avx2 + mov rcx, rsp + mov rdx, rsp + lea r8, QWORD PTR [rsp+32] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+64] + mov rdx, rsp + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + mov r8, 4 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+32] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + mov r8, 9 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+32] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 19 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + mov r8, 9 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+32] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + mov r8, 49 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+32] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 99 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR 
[rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + mov r8, 49 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+32] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, 4 + call fe_sq_n_avx2 + mov rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+32] + mov r8, rsp + call fe_mul_avx2 + mov rdx, QWORD PTR [rsp+136] + mov rcx, QWORD PTR [rsp+128] + add rsp, 144 + ret +fe_invert_avx2 ENDP +_TEXT ENDS +IFDEF WOLFSSL_CURVE25519_NOT_USE_ED25519 +_DATA SEGMENT +ALIGN 16 + ; Four 64-bit limbs, least significant first, that curve25519_base_avx2 + ; copies to rsp+64 .. rsp+88 before entering its bit loop. +L_curve25519_base_avx2_x2 QWORD 5cae469cdd684efbh, 8f3f5ced1e350b5ch + QWORD 0d9750c687d157114h, 20d342d51873f1b7h + ; Single QWORD holding the address of the table above. + ; It is kept for layout compatibility and is not read by the loads below. +ptr_L_curve25519_base_avx2_x2 QWORD L_curve25519_base_avx2_x2 +_DATA ENDS +_TEXT SEGMENT READONLY PARA + ; curve25519_base_avx2: rcx is kept in r8 and rdx in r9 for the whole routine. + ; The 32 bytes at r8 are initialised to the value 9 and then updated in place. + ; r9 is indexed 64 bits at a time by the bit loop to pick out single bits. +curve25519_base_avx2 PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbp + mov r8, rcx + mov r9, rdx + sub rsp, 176 + ; rsp+168 is cleared here and re-read at the top of every bit-loop pass. + mov QWORD PTR [rsp+168], 0 + ; Keep the first argument at rsp+160 so it can be reloaded later. + mov QWORD PTR [rsp+160], r8 + ; Set base point x + mov QWORD PTR [r8], 9 + mov QWORD PTR [r8+8], 0 + mov QWORD PTR [r8+16], 0 + mov QWORD PTR [r8+24], 0 + ; Set one + mov QWORD PTR [rsp], 1 + mov QWORD PTR [rsp+8], 0 + mov QWORD PTR [rsp+16], 0 + mov QWORD PTR [rsp+24], 0 + ; Load the four limbs from the constant table itself. + ; Indexing off ptr_L_curve25519_base_avx2_x2 would instead fetch the table + ; address as limb 0 and unrelated data past the pointer as limbs 1 to 3. + mov r10, QWORD PTR [L_curve25519_base_avx2_x2] + mov r11, QWORD PTR [L_curve25519_base_avx2_x2+8] + mov r12, QWORD PTR [L_curve25519_base_avx2_x2+16] + mov r13, QWORD PTR [L_curve25519_base_avx2_x2+24] + ; Set one + mov QWORD PTR [rsp+32], 1 + mov QWORD PTR [rsp+40], 0 + mov QWORD PTR [rsp+48], 0 + mov QWORD PTR [rsp+56], 0 + mov QWORD PTR [rsp+64], r10 + mov QWORD PTR [rsp+72], r11 + mov QWORD PTR [rsp+80], r12 + mov QWORD PTR [rsp+88], r13 + ; Bit index used by the loop below starts at 253. + mov rbp, 253 +L_curve25519_base_avx2_bits: + mov rax, QWORD PTR 
[rsp+168] + mov rbx, rbp + mov rcx, rbp + shr rbx, 6 + and rcx, 63 + mov rbx, QWORD PTR [r9+8*rbx] + shr rbx, cl + and rbx, 1 + xor rax, rbx + neg rax + ; Conditional Swap + mov r10, QWORD PTR [r8] + mov r11, QWORD PTR [r8+8] + mov r12, QWORD PTR [r8+16] + mov r13, QWORD PTR [r8+24] + mov r14, QWORD PTR [rsp] + mov r15, QWORD PTR [rsp+8] + mov rdi, QWORD PTR [rsp+16] + mov rsi, QWORD PTR [rsp+24] + xor r10, QWORD PTR [rsp+64] + xor r11, QWORD PTR [rsp+72] + xor r12, QWORD PTR [rsp+80] + xor r13, QWORD PTR [rsp+88] + xor r14, QWORD PTR [rsp+32] + xor r15, QWORD PTR [rsp+40] + xor rdi, QWORD PTR [rsp+48] + xor rsi, QWORD PTR [rsp+56] + and r10, rax + and r11, rax + and r12, rax + and r13, rax + and r14, rax + and r15, rax + and rdi, rax + and rsi, rax + xor QWORD PTR [r8], r10 + xor QWORD PTR [r8+8], r11 + xor QWORD PTR [r8+16], r12 + xor QWORD PTR [r8+24], r13 + xor QWORD PTR [rsp], r14 + xor QWORD PTR [rsp+8], r15 + xor QWORD PTR [rsp+16], rdi + xor QWORD PTR [rsp+24], rsi + xor QWORD PTR [rsp+64], r10 + xor QWORD PTR [rsp+72], r11 + xor QWORD PTR [rsp+80], r12 + xor QWORD PTR [rsp+88], r13 + xor QWORD PTR [rsp+32], r14 + xor QWORD PTR [rsp+40], r15 + xor QWORD PTR [rsp+48], rdi + xor QWORD PTR [rsp+56], rsi + mov QWORD PTR [rsp+168], rbx + ; Add-Sub + ; Add + mov r10, QWORD PTR [r8] + mov r11, QWORD PTR [r8+8] + mov r12, QWORD PTR [r8+16] + mov r13, QWORD PTR [r8+24] + mov r14, r10 + add r10, QWORD PTR [rsp] + mov r15, r11 + adc r11, QWORD PTR [rsp+8] + mov rdi, r12 + adc r12, QWORD PTR [rsp+16] + mov rsi, r13 + adc r13, QWORD PTR [rsp+24] + mov rbx, 0 + adc rbx, 0 + shld rbx, r13, 1 + imul rbx, 19 + btr r13, 63 + ; Sub modulus (if overflow) + add r10, rbx + adc r11, 0 + adc r12, 0 + adc r13, 0 + ; Sub + sub r14, QWORD PTR [rsp] + sbb r15, QWORD PTR [rsp+8] + sbb rdi, QWORD PTR [rsp+16] + sbb rsi, QWORD PTR [rsp+24] + sbb rbx, rbx + shld rbx, rsi, 1 + imul rbx, -19 + btr rsi, 63 + ; Add modulus (if underflow) + sub r14, rbx + sbb r15, 0 + sbb rdi, 0 + sbb rsi, 0 + 
mov QWORD PTR [r8], r10 + mov QWORD PTR [r8+8], r11 + mov QWORD PTR [r8+16], r12 + mov QWORD PTR [r8+24], r13 + mov QWORD PTR [rsp+128], r14 + mov QWORD PTR [rsp+136], r15 + mov QWORD PTR [rsp+144], rdi + mov QWORD PTR [rsp+152], rsi + ; Add-Sub + ; Add + mov r10, QWORD PTR [rsp+64] + mov r11, QWORD PTR [rsp+72] + mov r12, QWORD PTR [rsp+80] + mov r13, QWORD PTR [rsp+88] + mov r14, r10 + add r10, QWORD PTR [rsp+32] + mov r15, r11 + adc r11, QWORD PTR [rsp+40] + mov rdi, r12 + adc r12, QWORD PTR [rsp+48] + mov rsi, r13 + adc r13, QWORD PTR [rsp+56] + mov rbx, 0 + adc rbx, 0 + shld rbx, r13, 1 + imul rbx, 19 + btr r13, 63 + ; Sub modulus (if overflow) + add r10, rbx + adc r11, 0 + adc r12, 0 + adc r13, 0 + ; Sub + sub r14, QWORD PTR [rsp+32] + sbb r15, QWORD PTR [rsp+40] + sbb rdi, QWORD PTR [rsp+48] + sbb rsi, QWORD PTR [rsp+56] + sbb rbx, rbx + shld rbx, rsi, 1 + imul rbx, -19 + btr rsi, 63 + ; Add modulus (if underflow) + sub r14, rbx + sbb r15, 0 + sbb rdi, 0 + sbb rsi, 0 + mov QWORD PTR [rsp+32], r10 + mov QWORD PTR [rsp+40], r11 + mov QWORD PTR [rsp+48], r12 + mov QWORD PTR [rsp+56], r13 + mov QWORD PTR [rsp+96], r14 + mov QWORD PTR [rsp+104], r15 + mov QWORD PTR [rsp+112], rdi + mov QWORD PTR [rsp+120], rsi + mov rax, QWORD PTR [rsp+32] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rsp+128] + mulx r11, r10, rax + ; A[2] * B[0] + mulx r13, r12, QWORD PTR [rsp+48] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [rsp+40] + xor rsi, rsi + adcx r11, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [rsp+136] + mulx r15, r14, QWORD PTR [rsp+56] + adcx r12, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r11, rcx + ; A[2] * B[1] + mulx rdi, rcx, QWORD PTR [rsp+48] + adox r12, rbx + adcx r13, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [rsp+144] + mulx rbx, rcx, QWORD PTR [rsp+40] + adcx r14, rdi + adox r13, rcx + adcx r15, rsi + adox r14, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox r15, rsi + xor rdi, rdi + adcx r12, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [rsp+136] + mulx 
rcx, rdx, QWORD PTR [rsp+40] + adcx r13, rbx + adox r12, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rsp+152] + adox r13, rcx + mulx rbx, rcx, QWORD PTR [rsp+40] + adcx r14, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [rsp+144] + mulx rcx, rdx, QWORD PTR [rsp+48] + adcx r15, rbx + adox r14, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rsp+152] + adox r15, rcx + mulx rbx, rcx, QWORD PTR [rsp+56] + adox rdi, rsi + adcx rdi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rsi, rbx + xor rbx, rbx + adcx r13, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rsp+56] + adcx r14, rcx + mulx rcx, rdx, QWORD PTR [rsp+128] + adox r13, rdx + adox r14, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [rsp+56] + mulx rcx, rdx, QWORD PTR [rsp+144] + adcx r15, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rsp+152] + adcx rdi, rcx + mulx rdx, rcx, QWORD PTR [rsp+48] + adcx rsi, rbx + adox r15, rcx + adox rdi, rdx + adox rsi, rbx + mov rdx, 38 + mulx rcx, rsi, rsi + add r13, rsi + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r13, 1 + imul rcx, rcx, 19 + and r13, rbx + xor rbx, rbx + adox r10, rcx + mulx r14, rcx, r14 + adcx r10, rcx + adox r11, r14 + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + adcx r13, rbx + ; Store + mov QWORD PTR [rsp+32], r10 + mov QWORD PTR [rsp+40], r11 + mov QWORD PTR [rsp+48], r12 + mov QWORD PTR [rsp+56], r13 + mov rax, QWORD PTR [rsp+96] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r11, r10, rax + ; A[2] * B[0] + mulx r13, r12, QWORD PTR [rsp+112] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [rsp+104] + xor rsi, rsi + adcx r11, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r15, r14, QWORD PTR [rsp+120] + adcx r12, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r11, rcx + ; A[2] * B[1] + mulx rdi, rcx, QWORD PTR [rsp+112] + adox r12, rbx + adcx r13, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx rbx, rcx, QWORD PTR [rsp+104] + adcx r14, rdi + adox r13, rcx + adcx r15, rsi + adox 
r14, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox r15, rsi + xor rdi, rdi + adcx r12, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rcx, rdx, QWORD PTR [rsp+104] + adcx r13, rbx + adox r12, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r13, rcx + mulx rbx, rcx, QWORD PTR [rsp+104] + adcx r14, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx rcx, rdx, QWORD PTR [rsp+112] + adcx r15, rbx + adox r14, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, rcx + mulx rbx, rcx, QWORD PTR [rsp+120] + adox rdi, rsi + adcx rdi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rsi, rbx + xor rbx, rbx + adcx r13, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rsp+120] + adcx r14, rcx + mulx rcx, rdx, QWORD PTR [r8] + adox r13, rdx + adox r14, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [rsp+120] + mulx rcx, rdx, QWORD PTR [r8+16] + adcx r15, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rdi, rcx + mulx rdx, rcx, QWORD PTR [rsp+112] + adcx rsi, rbx + adox r15, rcx + adox rdi, rdx + adox rsi, rbx + mov rdx, 38 + mulx rcx, rsi, rsi + add r13, rsi + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r13, 1 + imul rcx, rcx, 19 + and r13, rbx + xor rbx, rbx + adox r10, rcx + mulx r14, rcx, r14 + adcx r10, rcx + adox r11, r14 + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + adcx r13, rbx + ; Store + mov QWORD PTR [rsp], r10 + mov QWORD PTR [rsp+8], r11 + mov QWORD PTR [rsp+16], r12 + mov QWORD PTR [rsp+24], r13 + ; Square + mov rdx, QWORD PTR [rsp+128] + mov rax, QWORD PTR [rsp+136] + ; A[0] * A[1] + mov rsi, rdx + mulx r12, r11, rax + ; A[0] * A[3] + mulx r14, r13, QWORD PTR [rsp+152] + ; A[2] * A[1] + mov rdx, QWORD PTR [rsp+144] + mulx rbx, rcx, rax + xor r10, r10 + adox r13, rcx + ; A[2] * A[3] + mulx rdi, r15, QWORD PTR [rsp+152] + adox r14, rbx + ; A[2] * A[0] + mulx rbx, rcx, rsi + adox r15, r10 + adcx r12, rcx + adox rdi, r10 + ; A[1] * A[3] + mov rdx, rax + mulx rdx, rcx, QWORD 
PTR [rsp+152] + adcx r13, rbx + adcx r14, rcx + adcx r15, rdx + adcx rdi, r10 + ; A[0] * A[0] + mov rdx, rsi + mulx rcx, r10, rdx + xor rsi, rsi + adcx r11, r11 + ; A[1] * A[1] + mov rdx, rax + adox r11, rcx + mulx rbx, rcx, rdx + adcx r12, r12 + adox r12, rcx + adcx r13, r13 + ; A[2] * A[2] + mov rdx, QWORD PTR [rsp+144] + adox r13, rbx + mulx rcx, rbx, rdx + adcx r14, r14 + adox r14, rbx + adcx r15, r15 + ; A[3] * A[3] + mov rdx, QWORD PTR [rsp+152] + adox r15, rcx + mulx rbx, rcx, rdx + adcx rdi, rdi + adox rdi, rcx + adcx rsi, rsi + adox rsi, rbx + mov rdx, 38 + mulx rbx, rsi, rsi + add r13, rsi + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r13, 1 + imul rbx, rbx, 19 + and r13, rcx + xor rcx, rcx + adox r10, rbx + mulx r14, rbx, r14 + adcx r10, rbx + adox r11, r14 + mulx r15, rbx, r15 + adcx r11, rbx + adox r12, r15 + mulx rdi, rbx, rdi + adcx r12, rbx + adox r13, rdi + adcx r13, rcx + ; Store + mov QWORD PTR [rsp+96], r10 + mov QWORD PTR [rsp+104], r11 + mov QWORD PTR [rsp+112], r12 + mov QWORD PTR [rsp+120], r13 + ; Square + mov rdx, QWORD PTR [r8] + mov rax, QWORD PTR [r8+8] + ; A[0] * A[1] + mov rsi, rdx + mulx r12, r11, rax + ; A[0] * A[3] + mulx r14, r13, QWORD PTR [r8+24] + ; A[2] * A[1] + mov rdx, QWORD PTR [r8+16] + mulx rbx, rcx, rax + xor r10, r10 + adox r13, rcx + ; A[2] * A[3] + mulx rdi, r15, QWORD PTR [r8+24] + adox r14, rbx + ; A[2] * A[0] + mulx rbx, rcx, rsi + adox r15, r10 + adcx r12, rcx + adox rdi, r10 + ; A[1] * A[3] + mov rdx, rax + mulx rdx, rcx, QWORD PTR [r8+24] + adcx r13, rbx + adcx r14, rcx + adcx r15, rdx + adcx rdi, r10 + ; A[0] * A[0] + mov rdx, rsi + mulx rcx, r10, rdx + xor rsi, rsi + adcx r11, r11 + ; A[1] * A[1] + mov rdx, rax + adox r11, rcx + mulx rbx, rcx, rdx + adcx r12, r12 + adox r12, rcx + adcx r13, r13 + ; A[2] * A[2] + mov rdx, QWORD PTR [r8+16] + adox r13, rbx + mulx rcx, rbx, rdx + adcx r14, r14 + adox r14, rbx + adcx r15, r15 + ; A[3] * A[3] + mov rdx, QWORD PTR [r8+24] + adox r15, rcx + mulx rbx, rcx, 
rdx + adcx rdi, rdi + adox rdi, rcx + adcx rsi, rsi + adox rsi, rbx + mov rdx, 38 + mulx rbx, rsi, rsi + add r13, rsi + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r13, 1 + imul rbx, rbx, 19 + and r13, rcx + xor rcx, rcx + adox r10, rbx + mulx r14, rbx, r14 + adcx r10, rbx + adox r11, r14 + mulx r15, rbx, r15 + adcx r11, rbx + adox r12, r15 + mulx rdi, rbx, rdi + adcx r12, rbx + adox r13, rdi + adcx r13, rcx + ; Store + mov QWORD PTR [rsp+128], r10 + mov QWORD PTR [rsp+136], r11 + mov QWORD PTR [rsp+144], r12 + mov QWORD PTR [rsp+152], r13 + ; Add-Sub + ; Add + mov r10, QWORD PTR [rsp] + mov r11, QWORD PTR [rsp+8] + mov r12, QWORD PTR [rsp+16] + mov r13, QWORD PTR [rsp+24] + mov r14, r10 + add r10, QWORD PTR [rsp+32] + mov r15, r11 + adc r11, QWORD PTR [rsp+40] + mov rdi, r12 + adc r12, QWORD PTR [rsp+48] + mov rsi, r13 + adc r13, QWORD PTR [rsp+56] + mov rbx, 0 + adc rbx, 0 + shld rbx, r13, 1 + imul rbx, 19 + btr r13, 63 + ; Sub modulus (if overflow) + add r10, rbx + adc r11, 0 + adc r12, 0 + adc r13, 0 + ; Sub + sub r14, QWORD PTR [rsp+32] + sbb r15, QWORD PTR [rsp+40] + sbb rdi, QWORD PTR [rsp+48] + sbb rsi, QWORD PTR [rsp+56] + sbb rbx, rbx + shld rbx, rsi, 1 + imul rbx, -19 + btr rsi, 63 + ; Add modulus (if underflow) + sub r14, rbx + sbb r15, 0 + sbb rdi, 0 + sbb rsi, 0 + mov QWORD PTR [rsp+64], r10 + mov QWORD PTR [rsp+72], r11 + mov QWORD PTR [rsp+80], r12 + mov QWORD PTR [rsp+88], r13 + mov QWORD PTR [rsp+32], r14 + mov QWORD PTR [rsp+40], r15 + mov QWORD PTR [rsp+48], rdi + mov QWORD PTR [rsp+56], rsi + mov rax, QWORD PTR [rsp+128] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rsp+96] + mulx r11, r10, rax + ; A[2] * B[0] + mulx r13, r12, QWORD PTR [rsp+144] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [rsp+136] + xor rsi, rsi + adcx r11, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [rsp+104] + mulx r15, r14, QWORD PTR [rsp+152] + adcx r12, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r11, rcx + ; A[2] * B[1] + mulx rdi, rcx, QWORD PTR 
[rsp+144] + adox r12, rbx + adcx r13, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [rsp+112] + mulx rbx, rcx, QWORD PTR [rsp+136] + adcx r14, rdi + adox r13, rcx + adcx r15, rsi + adox r14, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox r15, rsi + xor rdi, rdi + adcx r12, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [rsp+104] + mulx rcx, rdx, QWORD PTR [rsp+136] + adcx r13, rbx + adox r12, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rsp+120] + adox r13, rcx + mulx rbx, rcx, QWORD PTR [rsp+136] + adcx r14, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [rsp+112] + mulx rcx, rdx, QWORD PTR [rsp+144] + adcx r15, rbx + adox r14, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rsp+120] + adox r15, rcx + mulx rbx, rcx, QWORD PTR [rsp+152] + adox rdi, rsi + adcx rdi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rsi, rbx + xor rbx, rbx + adcx r13, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rsp+152] + adcx r14, rcx + mulx rcx, rdx, QWORD PTR [rsp+96] + adox r13, rdx + adox r14, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [rsp+152] + mulx rcx, rdx, QWORD PTR [rsp+112] + adcx r15, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rsp+120] + adcx rdi, rcx + mulx rdx, rcx, QWORD PTR [rsp+144] + adcx rsi, rbx + adox r15, rcx + adox rdi, rdx + adox rsi, rbx + mov rdx, 38 + mulx rcx, rsi, rsi + add r13, rsi + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r13, 1 + imul rcx, rcx, 19 + and r13, rbx + xor rbx, rbx + adox r10, rcx + mulx r14, rcx, r14 + adcx r10, rcx + adox r11, r14 + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + adcx r13, rbx + ; Store + mov QWORD PTR [r8], r10 + mov QWORD PTR [r8+8], r11 + mov QWORD PTR [r8+16], r12 + mov QWORD PTR [r8+24], r13 + ; Sub + mov r10, QWORD PTR [rsp+128] + mov r11, QWORD PTR [rsp+136] + mov r12, QWORD PTR [rsp+144] + mov r13, QWORD PTR [rsp+152] + sub r10, QWORD PTR [rsp+96] + sbb r11, QWORD PTR [rsp+104] + sbb r12, QWORD PTR [rsp+112] + sbb r13, QWORD PTR [rsp+120] + sbb rbx, rbx + shld rbx, r13, 1 + 
imul rbx, -19 + btr r13, 63 + ; Add modulus (if underflow) + sub r10, rbx + sbb r11, 0 + sbb r12, 0 + sbb r13, 0 + mov QWORD PTR [rsp+128], r10 + mov QWORD PTR [rsp+136], r11 + mov QWORD PTR [rsp+144], r12 + mov QWORD PTR [rsp+152], r13 + ; Square + mov rdx, QWORD PTR [rsp+32] + mov rax, QWORD PTR [rsp+40] + ; A[0] * A[1] + mov rsi, rdx + mulx r12, r11, rax + ; A[0] * A[3] + mulx r14, r13, QWORD PTR [rsp+56] + ; A[2] * A[1] + mov rdx, QWORD PTR [rsp+48] + mulx rbx, rcx, rax + xor r10, r10 + adox r13, rcx + ; A[2] * A[3] + mulx rdi, r15, QWORD PTR [rsp+56] + adox r14, rbx + ; A[2] * A[0] + mulx rbx, rcx, rsi + adox r15, r10 + adcx r12, rcx + adox rdi, r10 + ; A[1] * A[3] + mov rdx, rax + mulx rdx, rcx, QWORD PTR [rsp+56] + adcx r13, rbx + adcx r14, rcx + adcx r15, rdx + adcx rdi, r10 + ; A[0] * A[0] + mov rdx, rsi + mulx rcx, r10, rdx + xor rsi, rsi + adcx r11, r11 + ; A[1] * A[1] + mov rdx, rax + adox r11, rcx + mulx rbx, rcx, rdx + adcx r12, r12 + adox r12, rcx + adcx r13, r13 + ; A[2] * A[2] + mov rdx, QWORD PTR [rsp+48] + adox r13, rbx + mulx rcx, rbx, rdx + adcx r14, r14 + adox r14, rbx + adcx r15, r15 + ; A[3] * A[3] + mov rdx, QWORD PTR [rsp+56] + adox r15, rcx + mulx rbx, rcx, rdx + adcx rdi, rdi + adox rdi, rcx + adcx rsi, rsi + adox rsi, rbx + mov rdx, 38 + mulx rbx, rsi, rsi + add r13, rsi + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r13, 1 + imul rbx, rbx, 19 + and r13, rcx + xor rcx, rcx + adox r10, rbx + mulx r14, rbx, r14 + adcx r10, rbx + adox r11, r14 + mulx r15, rbx, r15 + adcx r11, rbx + adox r12, r15 + mulx rdi, rbx, rdi + adcx r12, rbx + adox r13, rdi + adcx r13, rcx + ; Store + mov QWORD PTR [rsp+32], r10 + mov QWORD PTR [rsp+40], r11 + mov QWORD PTR [rsp+48], r12 + mov QWORD PTR [rsp+56], r13 + ; Square + mov rdx, QWORD PTR [rsp+64] + mov rax, QWORD PTR [rsp+72] + ; A[0] * A[1] + mov rsi, rdx + mulx r12, r11, rax + ; A[0] * A[3] + mulx r14, r13, QWORD PTR [rsp+88] + ; A[2] * A[1] + mov rdx, QWORD PTR [rsp+80] + mulx rbx, rcx, rax + 
xor r10, r10 + adox r13, rcx + ; A[2] * A[3] + mulx rdi, r15, QWORD PTR [rsp+88] + adox r14, rbx + ; A[2] * A[0] + mulx rbx, rcx, rsi + adox r15, r10 + adcx r12, rcx + adox rdi, r10 + ; A[1] * A[3] + mov rdx, rax + mulx rdx, rcx, QWORD PTR [rsp+88] + adcx r13, rbx + adcx r14, rcx + adcx r15, rdx + adcx rdi, r10 + ; A[0] * A[0] + mov rdx, rsi + mulx rcx, r10, rdx + xor rsi, rsi + adcx r11, r11 + ; A[1] * A[1] + mov rdx, rax + adox r11, rcx + mulx rbx, rcx, rdx + adcx r12, r12 + adox r12, rcx + adcx r13, r13 + ; A[2] * A[2] + mov rdx, QWORD PTR [rsp+80] + adox r13, rbx + mulx rcx, rbx, rdx + adcx r14, r14 + adox r14, rbx + adcx r15, r15 + ; A[3] * A[3] + mov rdx, QWORD PTR [rsp+88] + adox r15, rcx + mulx rbx, rcx, rdx + adcx rdi, rdi + adox rdi, rcx + adcx rsi, rsi + adox rsi, rbx + mov rdx, 38 + mulx rbx, rsi, rsi + add r13, rsi + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r13, 1 + imul rbx, rbx, 19 + and r13, rcx + xor rcx, rcx + adox r10, rbx + mulx r14, rbx, r14 + adcx r10, rbx + adox r11, r14 + mulx r15, rbx, r15 + adcx r11, rbx + adox r12, r15 + mulx rdi, rbx, rdi + adcx r12, rbx + adox r13, rdi + adcx r13, rcx + ; Store + mov QWORD PTR [rsp+64], r10 + mov QWORD PTR [rsp+72], r11 + mov QWORD PTR [rsp+80], r12 + mov QWORD PTR [rsp+88], r13 + mov rdx, 121666 + mulx rsi, r10, QWORD PTR [rsp+128] + mulx rdi, r11, QWORD PTR [rsp+136] + mulx r15, r12, QWORD PTR [rsp+144] + add r11, rsi + mulx r14, r13, QWORD PTR [rsp+152] + adc r12, rdi + adc r13, r15 + adc r14, 0 + add r10, QWORD PTR [rsp+96] + adc r11, QWORD PTR [rsp+104] + adc r12, QWORD PTR [rsp+112] + adc r13, QWORD PTR [rsp+120] + adc r14, 0 + shld r14, r13, 1 + btr r13, 63 + imul r14, r14, 19 + add r10, r14 + adc r11, 0 + adc r12, 0 + adc r13, 0 + mov QWORD PTR [rsp+96], r10 + mov QWORD PTR [rsp+104], r11 + mov QWORD PTR [rsp+112], r12 + mov QWORD PTR [rsp+120], r13 + mov rdx, 9 + mulx rsi, r10, QWORD PTR [rsp+32] + mulx rdi, r11, QWORD PTR [rsp+40] + mulx r15, r12, QWORD PTR [rsp+48] + add r11, rsi 
+ mulx r14, r13, QWORD PTR [rsp+56] + adc r12, rdi + adc r13, r15 + adc r14, 0 + shld r14, r13, 1 + btr r13, 63 + imul r14, r14, 19 + add r10, r14 + adc r11, 0 + adc r12, 0 + adc r13, 0 + mov QWORD PTR [rsp+32], r10 + mov QWORD PTR [rsp+40], r11 + mov QWORD PTR [rsp+48], r12 + mov QWORD PTR [rsp+56], r13 + mov rax, QWORD PTR [rsp+128] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rsp+96] + mulx r11, r10, rax + ; A[2] * B[0] + mulx r13, r12, QWORD PTR [rsp+144] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [rsp+136] + xor rsi, rsi + adcx r11, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [rsp+104] + mulx r15, r14, QWORD PTR [rsp+152] + adcx r12, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r11, rcx + ; A[2] * B[1] + mulx rdi, rcx, QWORD PTR [rsp+144] + adox r12, rbx + adcx r13, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [rsp+112] + mulx rbx, rcx, QWORD PTR [rsp+136] + adcx r14, rdi + adox r13, rcx + adcx r15, rsi + adox r14, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox r15, rsi + xor rdi, rdi + adcx r12, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [rsp+104] + mulx rcx, rdx, QWORD PTR [rsp+136] + adcx r13, rbx + adox r12, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rsp+120] + adox r13, rcx + mulx rbx, rcx, QWORD PTR [rsp+136] + adcx r14, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [rsp+112] + mulx rcx, rdx, QWORD PTR [rsp+144] + adcx r15, rbx + adox r14, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rsp+120] + adox r15, rcx + mulx rbx, rcx, QWORD PTR [rsp+152] + adox rdi, rsi + adcx rdi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rsi, rbx + xor rbx, rbx + adcx r13, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rsp+152] + adcx r14, rcx + mulx rcx, rdx, QWORD PTR [rsp+96] + adox r13, rdx + adox r14, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [rsp+152] + mulx rcx, rdx, QWORD PTR [rsp+112] + adcx r15, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rsp+120] + adcx rdi, rcx + mulx rdx, rcx, QWORD PTR [rsp+144] + adcx rsi, rbx + adox r15, rcx + adox rdi, rdx + adox rsi, rbx + mov rdx, 38 + 
mulx rcx, rsi, rsi + add r13, rsi + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r13, 1 + imul rcx, rcx, 19 + and r13, rbx + xor rbx, rbx + adox r10, rcx + mulx r14, rcx, r14 + adcx r10, rcx + adox r11, r14 + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + adcx r13, rbx + ; Store + mov QWORD PTR [rsp], r10 + mov QWORD PTR [rsp+8], r11 + mov QWORD PTR [rsp+16], r12 + mov QWORD PTR [rsp+24], r13 + dec rbp + cmp rbp, 3 + jge L_curve25519_base_avx2_bits + mov rax, QWORD PTR [rsp+168] + neg rax + ; Conditional Swap + mov r10, QWORD PTR [r8] + mov r11, QWORD PTR [r8+8] + mov r12, QWORD PTR [r8+16] + mov r13, QWORD PTR [r8+24] + mov r14, QWORD PTR [rsp] + mov r15, QWORD PTR [rsp+8] + mov rdi, QWORD PTR [rsp+16] + mov rsi, QWORD PTR [rsp+24] + xor r10, QWORD PTR [rsp+64] + xor r11, QWORD PTR [rsp+72] + xor r12, QWORD PTR [rsp+80] + xor r13, QWORD PTR [rsp+88] + xor r14, QWORD PTR [rsp+32] + xor r15, QWORD PTR [rsp+40] + xor rdi, QWORD PTR [rsp+48] + xor rsi, QWORD PTR [rsp+56] + and r10, rax + and r11, rax + and r12, rax + and r13, rax + and r14, rax + and r15, rax + and rdi, rax + and rsi, rax + xor QWORD PTR [r8], r10 + xor QWORD PTR [r8+8], r11 + xor QWORD PTR [r8+16], r12 + xor QWORD PTR [r8+24], r13 + xor QWORD PTR [rsp], r14 + xor QWORD PTR [rsp+8], r15 + xor QWORD PTR [rsp+16], rdi + xor QWORD PTR [rsp+24], rsi + xor QWORD PTR [rsp+64], r10 + xor QWORD PTR [rsp+72], r11 + xor QWORD PTR [rsp+80], r12 + xor QWORD PTR [rsp+88], r13 + xor QWORD PTR [rsp+32], r14 + xor QWORD PTR [rsp+40], r15 + xor QWORD PTR [rsp+48], rdi + xor QWORD PTR [rsp+56], rsi +L_curve25519_base_avx2_last_3: + ; Add-Sub + ; Add + mov r10, QWORD PTR [r8] + mov r11, QWORD PTR [r8+8] + mov r12, QWORD PTR [r8+16] + mov r13, QWORD PTR [r8+24] + mov r14, r10 + add r10, QWORD PTR [rsp] + mov r15, r11 + adc r11, QWORD PTR [rsp+8] + mov rdi, r12 + adc r12, QWORD PTR [rsp+16] + mov rsi, r13 + adc r13, QWORD PTR [rsp+24] + mov rbx, 0 + adc 
rbx, 0 + shld rbx, r13, 1 + imul rbx, 19 + btr r13, 63 + ; Sub modulus (if overflow) + add r10, rbx + adc r11, 0 + adc r12, 0 + adc r13, 0 + ; Sub + sub r14, QWORD PTR [rsp] + sbb r15, QWORD PTR [rsp+8] + sbb rdi, QWORD PTR [rsp+16] + sbb rsi, QWORD PTR [rsp+24] + sbb rbx, rbx + shld rbx, rsi, 1 + imul rbx, -19 + btr rsi, 63 + ; Add modulus (if underflow) + sub r14, rbx + sbb r15, 0 + sbb rdi, 0 + sbb rsi, 0 + mov QWORD PTR [r8], r10 + mov QWORD PTR [r8+8], r11 + mov QWORD PTR [r8+16], r12 + mov QWORD PTR [r8+24], r13 + mov QWORD PTR [rsp+128], r14 + mov QWORD PTR [rsp+136], r15 + mov QWORD PTR [rsp+144], rdi + mov QWORD PTR [rsp+152], rsi + ; Square + mov rdx, QWORD PTR [rsp+128] + mov rax, QWORD PTR [rsp+136] + ; A[0] * A[1] + mov rsi, rdx + mulx r12, r11, rax + ; A[0] * A[3] + mulx r14, r13, QWORD PTR [rsp+152] + ; A[2] * A[1] + mov rdx, QWORD PTR [rsp+144] + mulx rbx, rcx, rax + xor r10, r10 + adox r13, rcx + ; A[2] * A[3] + mulx rdi, r15, QWORD PTR [rsp+152] + adox r14, rbx + ; A[2] * A[0] + mulx rbx, rcx, rsi + adox r15, r10 + adcx r12, rcx + adox rdi, r10 + ; A[1] * A[3] + mov rdx, rax + mulx rdx, rcx, QWORD PTR [rsp+152] + adcx r13, rbx + adcx r14, rcx + adcx r15, rdx + adcx rdi, r10 + ; A[0] * A[0] + mov rdx, rsi + mulx rcx, r10, rdx + xor rsi, rsi + adcx r11, r11 + ; A[1] * A[1] + mov rdx, rax + adox r11, rcx + mulx rbx, rcx, rdx + adcx r12, r12 + adox r12, rcx + adcx r13, r13 + ; A[2] * A[2] + mov rdx, QWORD PTR [rsp+144] + adox r13, rbx + mulx rcx, rbx, rdx + adcx r14, r14 + adox r14, rbx + adcx r15, r15 + ; A[3] * A[3] + mov rdx, QWORD PTR [rsp+152] + adox r15, rcx + mulx rbx, rcx, rdx + adcx rdi, rdi + adox rdi, rcx + adcx rsi, rsi + adox rsi, rbx + mov rdx, 38 + mulx rbx, rsi, rsi + add r13, rsi + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r13, 1 + imul rbx, rbx, 19 + and r13, rcx + xor rcx, rcx + adox r10, rbx + mulx r14, rbx, r14 + adcx r10, rbx + adox r11, r14 + mulx r15, rbx, r15 + adcx r11, rbx + adox r12, r15 + mulx rdi, rbx, rdi + 
adcx r12, rbx + adox r13, rdi + adcx r13, rcx + ; Store + mov QWORD PTR [rsp+96], r10 + mov QWORD PTR [rsp+104], r11 + mov QWORD PTR [rsp+112], r12 + mov QWORD PTR [rsp+120], r13 + ; Square + mov rdx, QWORD PTR [r8] + mov rax, QWORD PTR [r8+8] + ; A[0] * A[1] + mov rsi, rdx + mulx r12, r11, rax + ; A[0] * A[3] + mulx r14, r13, QWORD PTR [r8+24] + ; A[2] * A[1] + mov rdx, QWORD PTR [r8+16] + mulx rbx, rcx, rax + xor r10, r10 + adox r13, rcx + ; A[2] * A[3] + mulx rdi, r15, QWORD PTR [r8+24] + adox r14, rbx + ; A[2] * A[0] + mulx rbx, rcx, rsi + adox r15, r10 + adcx r12, rcx + adox rdi, r10 + ; A[1] * A[3] + mov rdx, rax + mulx rdx, rcx, QWORD PTR [r8+24] + adcx r13, rbx + adcx r14, rcx + adcx r15, rdx + adcx rdi, r10 + ; A[0] * A[0] + mov rdx, rsi + mulx rcx, r10, rdx + xor rsi, rsi + adcx r11, r11 + ; A[1] * A[1] + mov rdx, rax + adox r11, rcx + mulx rbx, rcx, rdx + adcx r12, r12 + adox r12, rcx + adcx r13, r13 + ; A[2] * A[2] + mov rdx, QWORD PTR [r8+16] + adox r13, rbx + mulx rcx, rbx, rdx + adcx r14, r14 + adox r14, rbx + adcx r15, r15 + ; A[3] * A[3] + mov rdx, QWORD PTR [r8+24] + adox r15, rcx + mulx rbx, rcx, rdx + adcx rdi, rdi + adox rdi, rcx + adcx rsi, rsi + adox rsi, rbx + mov rdx, 38 + mulx rbx, rsi, rsi + add r13, rsi + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r13, 1 + imul rbx, rbx, 19 + and r13, rcx + xor rcx, rcx + adox r10, rbx + mulx r14, rbx, r14 + adcx r10, rbx + adox r11, r14 + mulx r15, rbx, r15 + adcx r11, rbx + adox r12, r15 + mulx rdi, rbx, rdi + adcx r12, rbx + adox r13, rdi + adcx r13, rcx + ; Store + mov QWORD PTR [rsp+128], r10 + mov QWORD PTR [rsp+136], r11 + mov QWORD PTR [rsp+144], r12 + mov QWORD PTR [rsp+152], r13 + mov rax, QWORD PTR [rsp+128] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rsp+96] + mulx r11, r10, rax + ; A[2] * B[0] + mulx r13, r12, QWORD PTR [rsp+144] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [rsp+136] + xor rsi, rsi + adcx r11, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [rsp+104] + mulx r15, 
r14, QWORD PTR [rsp+152] + adcx r12, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r11, rcx + ; A[2] * B[1] + mulx rdi, rcx, QWORD PTR [rsp+144] + adox r12, rbx + adcx r13, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [rsp+112] + mulx rbx, rcx, QWORD PTR [rsp+136] + adcx r14, rdi + adox r13, rcx + adcx r15, rsi + adox r14, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox r15, rsi + xor rdi, rdi + adcx r12, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [rsp+104] + mulx rcx, rdx, QWORD PTR [rsp+136] + adcx r13, rbx + adox r12, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rsp+120] + adox r13, rcx + mulx rbx, rcx, QWORD PTR [rsp+136] + adcx r14, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [rsp+112] + mulx rcx, rdx, QWORD PTR [rsp+144] + adcx r15, rbx + adox r14, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rsp+120] + adox r15, rcx + mulx rbx, rcx, QWORD PTR [rsp+152] + adox rdi, rsi + adcx rdi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rsi, rbx + xor rbx, rbx + adcx r13, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rsp+152] + adcx r14, rcx + mulx rcx, rdx, QWORD PTR [rsp+96] + adox r13, rdx + adox r14, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [rsp+152] + mulx rcx, rdx, QWORD PTR [rsp+112] + adcx r15, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rsp+120] + adcx rdi, rcx + mulx rdx, rcx, QWORD PTR [rsp+144] + adcx rsi, rbx + adox r15, rcx + adox rdi, rdx + adox rsi, rbx + mov rdx, 38 + mulx rcx, rsi, rsi + add r13, rsi + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r13, 1 + imul rcx, rcx, 19 + and r13, rbx + xor rbx, rbx + adox r10, rcx + mulx r14, rcx, r14 + adcx r10, rcx + adox r11, r14 + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + adcx r13, rbx + ; Store + mov QWORD PTR [r8], r10 + mov QWORD PTR [r8+8], r11 + mov QWORD PTR [r8+16], r12 + mov QWORD PTR [r8+24], r13 + ; Sub + mov r10, QWORD PTR [rsp+128] + mov r11, QWORD PTR [rsp+136] + mov r12, QWORD PTR [rsp+144] + mov r13, QWORD PTR [rsp+152] + sub r10, QWORD PTR 
[rsp+96] + sbb r11, QWORD PTR [rsp+104] + sbb r12, QWORD PTR [rsp+112] + sbb r13, QWORD PTR [rsp+120] + sbb rbx, rbx + shld rbx, r13, 1 + imul rbx, -19 + btr r13, 63 + ; Add modulus (if underflow) + sub r10, rbx + sbb r11, 0 + sbb r12, 0 + sbb r13, 0 + mov QWORD PTR [rsp+128], r10 + mov QWORD PTR [rsp+136], r11 + mov QWORD PTR [rsp+144], r12 + mov QWORD PTR [rsp+152], r13 + mov rdx, 121666 + mulx rsi, r10, QWORD PTR [rsp+128] + mulx rdi, r11, QWORD PTR [rsp+136] + mulx r15, r12, QWORD PTR [rsp+144] + add r11, rsi + mulx r14, r13, QWORD PTR [rsp+152] + adc r12, rdi + adc r13, r15 + adc r14, 0 + add r10, QWORD PTR [rsp+96] + adc r11, QWORD PTR [rsp+104] + adc r12, QWORD PTR [rsp+112] + adc r13, QWORD PTR [rsp+120] + adc r14, 0 + shld r14, r13, 1 + btr r13, 63 + imul r14, r14, 19 + add r10, r14 + adc r11, 0 + adc r12, 0 + adc r13, 0 + mov QWORD PTR [rsp+96], r10 + mov QWORD PTR [rsp+104], r11 + mov QWORD PTR [rsp+112], r12 + mov QWORD PTR [rsp+120], r13 + mov rax, QWORD PTR [rsp+128] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rsp+96] + mulx r11, r10, rax + ; A[2] * B[0] + mulx r13, r12, QWORD PTR [rsp+144] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [rsp+136] + xor rsi, rsi + adcx r11, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [rsp+104] + mulx r15, r14, QWORD PTR [rsp+152] + adcx r12, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r11, rcx + ; A[2] * B[1] + mulx rdi, rcx, QWORD PTR [rsp+144] + adox r12, rbx + adcx r13, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [rsp+112] + mulx rbx, rcx, QWORD PTR [rsp+136] + adcx r14, rdi + adox r13, rcx + adcx r15, rsi + adox r14, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox r15, rsi + xor rdi, rdi + adcx r12, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [rsp+104] + mulx rcx, rdx, QWORD PTR [rsp+136] + adcx r13, rbx + adox r12, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rsp+120] + adox r13, rcx + mulx rbx, rcx, QWORD PTR [rsp+136] + adcx r14, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [rsp+112] + mulx rcx, rdx, QWORD PTR [rsp+144] 
+ adcx r15, rbx + adox r14, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rsp+120] + adox r15, rcx + mulx rbx, rcx, QWORD PTR [rsp+152] + adox rdi, rsi + adcx rdi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rsi, rbx + xor rbx, rbx + adcx r13, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rsp+152] + adcx r14, rcx + mulx rcx, rdx, QWORD PTR [rsp+96] + adox r13, rdx + adox r14, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [rsp+152] + mulx rcx, rdx, QWORD PTR [rsp+112] + adcx r15, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rsp+120] + adcx rdi, rcx + mulx rdx, rcx, QWORD PTR [rsp+144] + adcx rsi, rbx + adox r15, rcx + adox rdi, rdx + adox rsi, rbx + mov rdx, 38 + mulx rcx, rsi, rsi + add r13, rsi + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r13, 1 + imul rcx, rcx, 19 + and r13, rbx + xor rbx, rbx + adox r10, rcx + mulx r14, rcx, r14 + adcx r10, rcx + adox r11, r14 + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + adcx r13, rbx + ; Store + mov QWORD PTR [rsp], r10 + mov QWORD PTR [rsp+8], r11 + mov QWORD PTR [rsp+16], r12 + mov QWORD PTR [rsp+24], r13 + dec rbp + jge L_curve25519_base_avx2_last_3 + ; Invert + lea rcx, QWORD PTR [rsp+32] + mov rdx, rsp + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + mov rdx, rsp + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+96] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 4 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, 
QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 9 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+128] + mov r8, 19 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+128] + lea r8, QWORD PTR [rsp+96] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 9 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 49 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+128] + mov r8, 99 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+128] + lea r8, QWORD PTR [rsp+96] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 49 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + mov r8, 4 + call fe_sq_n_avx2 + mov rcx, rsp + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+32] + call 
fe_mul_avx2 + mov r8, QWORD PTR [rsp+160] + mov rax, QWORD PTR [r8] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rsp] + mulx r11, r10, rax + ; A[2] * B[0] + mulx r13, r12, QWORD PTR [r8+16] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [r8+8] + xor rsi, rsi + adcx r11, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [rsp+8] + mulx r15, r14, QWORD PTR [r8+24] + adcx r12, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r11, rcx + ; A[2] * B[1] + mulx rdi, rcx, QWORD PTR [r8+16] + adox r12, rbx + adcx r13, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [rsp+16] + mulx rbx, rcx, QWORD PTR [r8+8] + adcx r14, rdi + adox r13, rcx + adcx r15, rsi + adox r14, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox r15, rsi + xor rdi, rdi + adcx r12, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [rsp+8] + mulx rcx, rdx, QWORD PTR [r8+8] + adcx r13, rbx + adox r12, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rsp+24] + adox r13, rcx + mulx rbx, rcx, QWORD PTR [r8+8] + adcx r14, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [rsp+16] + mulx rcx, rdx, QWORD PTR [r8+16] + adcx r15, rbx + adox r14, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rsp+24] + adox r15, rcx + mulx rbx, rcx, QWORD PTR [r8+24] + adox rdi, rsi + adcx rdi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rsi, rbx + xor rbx, rbx + adcx r13, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [r8+24] + adcx r14, rcx + mulx rcx, rdx, QWORD PTR [rsp] + adox r13, rdx + adox r14, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [r8+24] + mulx rcx, rdx, QWORD PTR [rsp+16] + adcx r15, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rsp+24] + adcx rdi, rcx + mulx rdx, rcx, QWORD PTR [r8+16] + adcx rsi, rbx + adox r15, rcx + adox rdi, rdx + adox rsi, rbx + mov rdx, 38 + mulx rcx, rsi, rsi + add r13, rsi + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r13, 1 + imul rcx, rcx, 19 + and r13, rbx + xor rbx, rbx + adox r10, rcx + mulx r14, rcx, r14 + adcx r10, rcx + adox r11, r14 + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox 
r13, rdi + adcx r13, rbx + mov rbx, 9223372036854775807 + mov rdx, r13 + sar rdx, 63 + and rdx, 19 + and r13, rbx + add r10, rdx + adc r11, 0 + adc r12, 0 + adc r13, 0 + mov rcx, 9223372036854775807 + mov rdx, r10 + add rdx, 19 + mov rdx, r11 + adc rdx, 0 + mov rdx, r12 + adc rdx, 0 + mov rdx, r13 + adc rdx, 0 + sar rdx, 63 + and rdx, 19 + and r13, rcx + add r10, rdx + adc r11, 0 + adc r12, 0 + adc r13, 0 + and r13, rcx + ; Store + mov QWORD PTR [r8], r10 + mov QWORD PTR [r8+8], r11 + mov QWORD PTR [r8+16], r12 + mov QWORD PTR [r8+24], r13 + xor rax, rax + add rsp, 176 + pop rbp + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +curve25519_base_avx2 ENDP +_TEXT ENDS +ENDIF +_TEXT SEGMENT READONLY PARA +curve25519_avx2 PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbp + mov r9, rcx + mov r10, rdx + sub rsp, 184 + mov QWORD PTR [rsp+176], 0 + mov QWORD PTR [rsp+168], r9 + ; Set one + mov QWORD PTR [r9], 1 + mov QWORD PTR [r9+8], 0 + mov QWORD PTR [r9+16], 0 + mov QWORD PTR [r9+24], 0 + ; Set zero + mov QWORD PTR [rsp], 0 + mov QWORD PTR [rsp+8], 0 + mov QWORD PTR [rsp+16], 0 + mov QWORD PTR [rsp+24], 0 + ; Set one + mov QWORD PTR [rsp+32], 1 + mov QWORD PTR [rsp+40], 0 + mov QWORD PTR [rsp+48], 0 + mov QWORD PTR [rsp+56], 0 + ; Copy + mov r11, QWORD PTR [r8] + mov r12, QWORD PTR [r8+8] + mov r13, QWORD PTR [r8+16] + mov r14, QWORD PTR [r8+24] + mov QWORD PTR [rsp+64], r11 + mov QWORD PTR [rsp+72], r12 + mov QWORD PTR [rsp+80], r13 + mov QWORD PTR [rsp+88], r14 + mov rbx, 254 +L_curve25519_avx2_bits: + mov QWORD PTR [rsp+160], rbx + mov rcx, rbx + mov rax, QWORD PTR [rsp+176] + and rcx, 63 + shr rbx, 6 + mov rbx, QWORD PTR [r10+8*rbx] + shr rbx, cl + and rbx, 1 + xor rax, rbx + mov QWORD PTR [rsp+176], rbx + neg rax + ; Conditional Swap + mov r11, QWORD PTR [r9] + mov r12, QWORD PTR [r9+8] + mov r13, QWORD PTR [r9+16] + mov r14, QWORD PTR [r9+24] + mov r15, QWORD PTR [rsp] + mov rdi, QWORD PTR 
[rsp+8] + mov rsi, QWORD PTR [rsp+16] + mov rbp, QWORD PTR [rsp+24] + xor r11, QWORD PTR [rsp+64] + xor r12, QWORD PTR [rsp+72] + xor r13, QWORD PTR [rsp+80] + xor r14, QWORD PTR [rsp+88] + xor r15, QWORD PTR [rsp+32] + xor rdi, QWORD PTR [rsp+40] + xor rsi, QWORD PTR [rsp+48] + xor rbp, QWORD PTR [rsp+56] + and r11, rax + and r12, rax + and r13, rax + and r14, rax + and r15, rax + and rdi, rax + and rsi, rax + and rbp, rax + xor QWORD PTR [r9], r11 + xor QWORD PTR [r9+8], r12 + xor QWORD PTR [r9+16], r13 + xor QWORD PTR [r9+24], r14 + xor QWORD PTR [rsp], r15 + xor QWORD PTR [rsp+8], rdi + xor QWORD PTR [rsp+16], rsi + xor QWORD PTR [rsp+24], rbp + xor QWORD PTR [rsp+64], r11 + xor QWORD PTR [rsp+72], r12 + xor QWORD PTR [rsp+80], r13 + xor QWORD PTR [rsp+88], r14 + xor QWORD PTR [rsp+32], r15 + xor QWORD PTR [rsp+40], rdi + xor QWORD PTR [rsp+48], rsi + xor QWORD PTR [rsp+56], rbp + ; Add-Sub + ; Add + mov r11, QWORD PTR [r9] + mov r12, QWORD PTR [r9+8] + mov r13, QWORD PTR [r9+16] + mov r14, QWORD PTR [r9+24] + mov r15, r11 + add r11, QWORD PTR [rsp] + mov rdi, r12 + adc r12, QWORD PTR [rsp+8] + mov rsi, r13 + adc r13, QWORD PTR [rsp+16] + mov rbp, r14 + adc r14, QWORD PTR [rsp+24] + mov rbx, 0 + adc rbx, 0 + shld rbx, r14, 1 + imul rbx, 19 + btr r14, 63 + ; Sub modulus (if overflow) + add r11, rbx + adc r12, 0 + adc r13, 0 + adc r14, 0 + ; Sub + sub r15, QWORD PTR [rsp] + sbb rdi, QWORD PTR [rsp+8] + sbb rsi, QWORD PTR [rsp+16] + sbb rbp, QWORD PTR [rsp+24] + sbb rbx, rbx + shld rbx, rbp, 1 + imul rbx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub r15, rbx + sbb rdi, 0 + sbb rsi, 0 + sbb rbp, 0 + mov QWORD PTR [r9], r11 + mov QWORD PTR [r9+8], r12 + mov QWORD PTR [r9+16], r13 + mov QWORD PTR [r9+24], r14 + mov QWORD PTR [rsp+128], r15 + mov QWORD PTR [rsp+136], rdi + mov QWORD PTR [rsp+144], rsi + mov QWORD PTR [rsp+152], rbp + ; Add-Sub + ; Add + mov r11, QWORD PTR [rsp+64] + mov r12, QWORD PTR [rsp+72] + mov r13, QWORD PTR [rsp+80] + mov r14, QWORD 
PTR [rsp+88] + mov r15, r11 + add r11, QWORD PTR [rsp+32] + mov rdi, r12 + adc r12, QWORD PTR [rsp+40] + mov rsi, r13 + adc r13, QWORD PTR [rsp+48] + mov rbp, r14 + adc r14, QWORD PTR [rsp+56] + mov rbx, 0 + adc rbx, 0 + shld rbx, r14, 1 + imul rbx, 19 + btr r14, 63 + ; Sub modulus (if overflow) + add r11, rbx + adc r12, 0 + adc r13, 0 + adc r14, 0 + ; Sub + sub r15, QWORD PTR [rsp+32] + sbb rdi, QWORD PTR [rsp+40] + sbb rsi, QWORD PTR [rsp+48] + sbb rbp, QWORD PTR [rsp+56] + sbb rbx, rbx + shld rbx, rbp, 1 + imul rbx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub r15, rbx + sbb rdi, 0 + sbb rsi, 0 + sbb rbp, 0 + mov QWORD PTR [rsp+32], r11 + mov QWORD PTR [rsp+40], r12 + mov QWORD PTR [rsp+48], r13 + mov QWORD PTR [rsp+56], r14 + mov QWORD PTR [rsp+96], r15 + mov QWORD PTR [rsp+104], rdi + mov QWORD PTR [rsp+112], rsi + mov QWORD PTR [rsp+120], rbp + mov rax, QWORD PTR [rsp+32] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rsp+128] + mulx r12, r11, rax + ; A[2] * B[0] + mulx r14, r13, QWORD PTR [rsp+48] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [rsp+40] + xor rbp, rbp + adcx r12, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [rsp+136] + mulx rdi, r15, QWORD PTR [rsp+56] + adcx r13, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r12, rcx + ; A[2] * B[1] + mulx rsi, rcx, QWORD PTR [rsp+48] + adox r13, rbx + adcx r14, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [rsp+144] + mulx rbx, rcx, QWORD PTR [rsp+40] + adcx r15, rsi + adox r14, rcx + adcx rdi, rbp + adox r15, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox rdi, rbp + xor rsi, rsi + adcx r13, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [rsp+136] + mulx rcx, rdx, QWORD PTR [rsp+40] + adcx r14, rbx + adox r13, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rsp+152] + adox r14, rcx + mulx rbx, rcx, QWORD PTR [rsp+40] + adcx r15, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [rsp+144] + mulx rcx, rdx, QWORD PTR [rsp+48] + adcx rdi, rbx + adox r15, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rsp+152] + adox rdi, rcx + mulx 
rbx, rcx, QWORD PTR [rsp+56] + adox rsi, rbp + adcx rsi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rbp, rbx + xor rbx, rbx + adcx r14, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rsp+56] + adcx r15, rcx + mulx rcx, rdx, QWORD PTR [rsp+128] + adox r14, rdx + adox r15, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [rsp+56] + mulx rcx, rdx, QWORD PTR [rsp+144] + adcx rdi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rsp+152] + adcx rsi, rcx + mulx rdx, rcx, QWORD PTR [rsp+48] + adcx rbp, rbx + adox rdi, rcx + adox rsi, rdx + adox rbp, rbx + mov rdx, 38 + mulx rcx, rbp, rbp + add r14, rbp + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r14, 1 + imul rcx, rcx, 19 + and r14, rbx + xor rbx, rbx + adox r11, rcx + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + mulx rsi, rcx, rsi + adcx r13, rcx + adox r14, rsi + adcx r14, rbx + ; Store + mov QWORD PTR [rsp+32], r11 + mov QWORD PTR [rsp+40], r12 + mov QWORD PTR [rsp+48], r13 + mov QWORD PTR [rsp+56], r14 + mov rax, QWORD PTR [rsp+96] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r9] + mulx r12, r11, rax + ; A[2] * B[0] + mulx r14, r13, QWORD PTR [rsp+112] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [rsp+104] + xor rbp, rbp + adcx r12, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [r9+8] + mulx rdi, r15, QWORD PTR [rsp+120] + adcx r13, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r12, rcx + ; A[2] * B[1] + mulx rsi, rcx, QWORD PTR [rsp+112] + adox r13, rbx + adcx r14, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [r9+16] + mulx rbx, rcx, QWORD PTR [rsp+104] + adcx r15, rsi + adox r14, rcx + adcx rdi, rbp + adox r15, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox rdi, rbp + xor rsi, rsi + adcx r13, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [r9+8] + mulx rcx, rdx, QWORD PTR [rsp+104] + adcx r14, rbx + adox r13, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r9+24] + adox r14, rcx + mulx rbx, rcx, QWORD PTR [rsp+104] + adcx r15, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR 
[r9+16] + mulx rcx, rdx, QWORD PTR [rsp+112] + adcx rdi, rbx + adox r15, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r9+24] + adox rdi, rcx + mulx rbx, rcx, QWORD PTR [rsp+120] + adox rsi, rbp + adcx rsi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rbp, rbx + xor rbx, rbx + adcx r14, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rsp+120] + adcx r15, rcx + mulx rcx, rdx, QWORD PTR [r9] + adox r14, rdx + adox r15, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [rsp+120] + mulx rcx, rdx, QWORD PTR [r9+16] + adcx rdi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r9+24] + adcx rsi, rcx + mulx rdx, rcx, QWORD PTR [rsp+112] + adcx rbp, rbx + adox rdi, rcx + adox rsi, rdx + adox rbp, rbx + mov rdx, 38 + mulx rcx, rbp, rbp + add r14, rbp + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r14, 1 + imul rcx, rcx, 19 + and r14, rbx + xor rbx, rbx + adox r11, rcx + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + mulx rsi, rcx, rsi + adcx r13, rcx + adox r14, rsi + adcx r14, rbx + ; Store + mov QWORD PTR [rsp], r11 + mov QWORD PTR [rsp+8], r12 + mov QWORD PTR [rsp+16], r13 + mov QWORD PTR [rsp+24], r14 + ; Square + mov rdx, QWORD PTR [rsp+128] + mov rax, QWORD PTR [rsp+136] + ; A[0] * A[1] + mov rbp, rdx + mulx r13, r12, rax + ; A[0] * A[3] + mulx r15, r14, QWORD PTR [rsp+152] + ; A[2] * A[1] + mov rdx, QWORD PTR [rsp+144] + mulx rbx, rcx, rax + xor r11, r11 + adox r14, rcx + ; A[2] * A[3] + mulx rsi, rdi, QWORD PTR [rsp+152] + adox r15, rbx + ; A[2] * A[0] + mulx rbx, rcx, rbp + adox rdi, r11 + adcx r13, rcx + adox rsi, r11 + ; A[1] * A[3] + mov rdx, rax + mulx rdx, rcx, QWORD PTR [rsp+152] + adcx r14, rbx + adcx r15, rcx + adcx rdi, rdx + adcx rsi, r11 + ; A[0] * A[0] + mov rdx, rbp + mulx rcx, r11, rdx + xor rbp, rbp + adcx r12, r12 + ; A[1] * A[1] + mov rdx, rax + adox r12, rcx + mulx rbx, rcx, rdx + adcx r13, r13 + adox r13, rcx + adcx r14, r14 + ; A[2] * A[2] + mov rdx, QWORD PTR [rsp+144] + adox r14, rbx + mulx rcx, rbx, 
rdx + adcx r15, r15 + adox r15, rbx + adcx rdi, rdi + ; A[3] * A[3] + mov rdx, QWORD PTR [rsp+152] + adox rdi, rcx + mulx rbx, rcx, rdx + adcx rsi, rsi + adox rsi, rcx + adcx rbp, rbp + adox rbp, rbx + mov rdx, 38 + mulx rbx, rbp, rbp + add r14, rbp + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r14, 1 + imul rbx, rbx, 19 + and r14, rcx + xor rcx, rcx + adox r11, rbx + mulx r15, rbx, r15 + adcx r11, rbx + adox r12, r15 + mulx rdi, rbx, rdi + adcx r12, rbx + adox r13, rdi + mulx rsi, rbx, rsi + adcx r13, rbx + adox r14, rsi + adcx r14, rcx + ; Store + mov QWORD PTR [rsp+96], r11 + mov QWORD PTR [rsp+104], r12 + mov QWORD PTR [rsp+112], r13 + mov QWORD PTR [rsp+120], r14 + ; Square + mov rdx, QWORD PTR [r9] + mov rax, QWORD PTR [r9+8] + ; A[0] * A[1] + mov rbp, rdx + mulx r13, r12, rax + ; A[0] * A[3] + mulx r15, r14, QWORD PTR [r9+24] + ; A[2] * A[1] + mov rdx, QWORD PTR [r9+16] + mulx rbx, rcx, rax + xor r11, r11 + adox r14, rcx + ; A[2] * A[3] + mulx rsi, rdi, QWORD PTR [r9+24] + adox r15, rbx + ; A[2] * A[0] + mulx rbx, rcx, rbp + adox rdi, r11 + adcx r13, rcx + adox rsi, r11 + ; A[1] * A[3] + mov rdx, rax + mulx rdx, rcx, QWORD PTR [r9+24] + adcx r14, rbx + adcx r15, rcx + adcx rdi, rdx + adcx rsi, r11 + ; A[0] * A[0] + mov rdx, rbp + mulx rcx, r11, rdx + xor rbp, rbp + adcx r12, r12 + ; A[1] * A[1] + mov rdx, rax + adox r12, rcx + mulx rbx, rcx, rdx + adcx r13, r13 + adox r13, rcx + adcx r14, r14 + ; A[2] * A[2] + mov rdx, QWORD PTR [r9+16] + adox r14, rbx + mulx rcx, rbx, rdx + adcx r15, r15 + adox r15, rbx + adcx rdi, rdi + ; A[3] * A[3] + mov rdx, QWORD PTR [r9+24] + adox rdi, rcx + mulx rbx, rcx, rdx + adcx rsi, rsi + adox rsi, rcx + adcx rbp, rbp + adox rbp, rbx + mov rdx, 38 + mulx rbx, rbp, rbp + add r14, rbp + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r14, 1 + imul rbx, rbx, 19 + and r14, rcx + xor rcx, rcx + adox r11, rbx + mulx r15, rbx, r15 + adcx r11, rbx + adox r12, r15 + mulx rdi, rbx, rdi + adcx r12, rbx + adox r13, rdi + 
mulx rsi, rbx, rsi + adcx r13, rbx + adox r14, rsi + adcx r14, rcx + ; Store + mov QWORD PTR [rsp+128], r11 + mov QWORD PTR [rsp+136], r12 + mov QWORD PTR [rsp+144], r13 + mov QWORD PTR [rsp+152], r14 + ; Add-Sub + ; Add + mov r11, QWORD PTR [rsp] + mov r12, QWORD PTR [rsp+8] + mov r13, QWORD PTR [rsp+16] + mov r14, QWORD PTR [rsp+24] + mov r15, r11 + add r11, QWORD PTR [rsp+32] + mov rdi, r12 + adc r12, QWORD PTR [rsp+40] + mov rsi, r13 + adc r13, QWORD PTR [rsp+48] + mov rbp, r14 + adc r14, QWORD PTR [rsp+56] + mov rbx, 0 + adc rbx, 0 + shld rbx, r14, 1 + imul rbx, 19 + btr r14, 63 + ; Sub modulus (if overflow) + add r11, rbx + adc r12, 0 + adc r13, 0 + adc r14, 0 + ; Sub + sub r15, QWORD PTR [rsp+32] + sbb rdi, QWORD PTR [rsp+40] + sbb rsi, QWORD PTR [rsp+48] + sbb rbp, QWORD PTR [rsp+56] + sbb rbx, rbx + shld rbx, rbp, 1 + imul rbx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub r15, rbx + sbb rdi, 0 + sbb rsi, 0 + sbb rbp, 0 + mov QWORD PTR [rsp+64], r11 + mov QWORD PTR [rsp+72], r12 + mov QWORD PTR [rsp+80], r13 + mov QWORD PTR [rsp+88], r14 + mov QWORD PTR [rsp+32], r15 + mov QWORD PTR [rsp+40], rdi + mov QWORD PTR [rsp+48], rsi + mov QWORD PTR [rsp+56], rbp + mov rax, QWORD PTR [rsp+128] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rsp+96] + mulx r12, r11, rax + ; A[2] * B[0] + mulx r14, r13, QWORD PTR [rsp+144] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [rsp+136] + xor rbp, rbp + adcx r12, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [rsp+104] + mulx rdi, r15, QWORD PTR [rsp+152] + adcx r13, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r12, rcx + ; A[2] * B[1] + mulx rsi, rcx, QWORD PTR [rsp+144] + adox r13, rbx + adcx r14, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [rsp+112] + mulx rbx, rcx, QWORD PTR [rsp+136] + adcx r15, rsi + adox r14, rcx + adcx rdi, rbp + adox r15, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox rdi, rbp + xor rsi, rsi + adcx r13, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [rsp+104] + mulx rcx, rdx, QWORD PTR [rsp+136] + 
adcx r14, rbx + adox r13, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rsp+120] + adox r14, rcx + mulx rbx, rcx, QWORD PTR [rsp+136] + adcx r15, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [rsp+112] + mulx rcx, rdx, QWORD PTR [rsp+144] + adcx rdi, rbx + adox r15, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rsp+120] + adox rdi, rcx + mulx rbx, rcx, QWORD PTR [rsp+152] + adox rsi, rbp + adcx rsi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rbp, rbx + xor rbx, rbx + adcx r14, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rsp+152] + adcx r15, rcx + mulx rcx, rdx, QWORD PTR [rsp+96] + adox r14, rdx + adox r15, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [rsp+152] + mulx rcx, rdx, QWORD PTR [rsp+112] + adcx rdi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rsp+120] + adcx rsi, rcx + mulx rdx, rcx, QWORD PTR [rsp+144] + adcx rbp, rbx + adox rdi, rcx + adox rsi, rdx + adox rbp, rbx + mov rdx, 38 + mulx rcx, rbp, rbp + add r14, rbp + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r14, 1 + imul rcx, rcx, 19 + and r14, rbx + xor rbx, rbx + adox r11, rcx + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + mulx rsi, rcx, rsi + adcx r13, rcx + adox r14, rsi + adcx r14, rbx + ; Store + mov QWORD PTR [r9], r11 + mov QWORD PTR [r9+8], r12 + mov QWORD PTR [r9+16], r13 + mov QWORD PTR [r9+24], r14 + ; Sub + mov r11, QWORD PTR [rsp+128] + mov r12, QWORD PTR [rsp+136] + mov r13, QWORD PTR [rsp+144] + mov r14, QWORD PTR [rsp+152] + sub r11, QWORD PTR [rsp+96] + sbb r12, QWORD PTR [rsp+104] + sbb r13, QWORD PTR [rsp+112] + sbb r14, QWORD PTR [rsp+120] + sbb rbx, rbx + shld rbx, r14, 1 + imul rbx, -19 + btr r14, 63 + ; Add modulus (if underflow) + sub r11, rbx + sbb r12, 0 + sbb r13, 0 + sbb r14, 0 + mov QWORD PTR [rsp+128], r11 + mov QWORD PTR [rsp+136], r12 + mov QWORD PTR [rsp+144], r13 + mov QWORD PTR [rsp+152], r14 + ; Square + mov rdx, QWORD PTR [rsp+32] + mov rax, QWORD PTR [rsp+40] + ; A[0] * A[1] + mov rbp, rdx + mulx r13, r12, rax 
+ ; A[0] * A[3] + mulx r15, r14, QWORD PTR [rsp+56] + ; A[2] * A[1] + mov rdx, QWORD PTR [rsp+48] + mulx rbx, rcx, rax + xor r11, r11 + adox r14, rcx + ; A[2] * A[3] + mulx rsi, rdi, QWORD PTR [rsp+56] + adox r15, rbx + ; A[2] * A[0] + mulx rbx, rcx, rbp + adox rdi, r11 + adcx r13, rcx + adox rsi, r11 + ; A[1] * A[3] + mov rdx, rax + mulx rdx, rcx, QWORD PTR [rsp+56] + adcx r14, rbx + adcx r15, rcx + adcx rdi, rdx + adcx rsi, r11 + ; A[0] * A[0] + mov rdx, rbp + mulx rcx, r11, rdx + xor rbp, rbp + adcx r12, r12 + ; A[1] * A[1] + mov rdx, rax + adox r12, rcx + mulx rbx, rcx, rdx + adcx r13, r13 + adox r13, rcx + adcx r14, r14 + ; A[2] * A[2] + mov rdx, QWORD PTR [rsp+48] + adox r14, rbx + mulx rcx, rbx, rdx + adcx r15, r15 + adox r15, rbx + adcx rdi, rdi + ; A[3] * A[3] + mov rdx, QWORD PTR [rsp+56] + adox rdi, rcx + mulx rbx, rcx, rdx + adcx rsi, rsi + adox rsi, rcx + adcx rbp, rbp + adox rbp, rbx + mov rdx, 38 + mulx rbx, rbp, rbp + add r14, rbp + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r14, 1 + imul rbx, rbx, 19 + and r14, rcx + xor rcx, rcx + adox r11, rbx + mulx r15, rbx, r15 + adcx r11, rbx + adox r12, r15 + mulx rdi, rbx, rdi + adcx r12, rbx + adox r13, rdi + mulx rsi, rbx, rsi + adcx r13, rbx + adox r14, rsi + adcx r14, rcx + ; Store + mov QWORD PTR [rsp+32], r11 + mov QWORD PTR [rsp+40], r12 + mov QWORD PTR [rsp+48], r13 + mov QWORD PTR [rsp+56], r14 + ; Square + mov rdx, QWORD PTR [rsp+64] + mov rax, QWORD PTR [rsp+72] + ; A[0] * A[1] + mov rbp, rdx + mulx r13, r12, rax + ; A[0] * A[3] + mulx r15, r14, QWORD PTR [rsp+88] + ; A[2] * A[1] + mov rdx, QWORD PTR [rsp+80] + mulx rbx, rcx, rax + xor r11, r11 + adox r14, rcx + ; A[2] * A[3] + mulx rsi, rdi, QWORD PTR [rsp+88] + adox r15, rbx + ; A[2] * A[0] + mulx rbx, rcx, rbp + adox rdi, r11 + adcx r13, rcx + adox rsi, r11 + ; A[1] * A[3] + mov rdx, rax + mulx rdx, rcx, QWORD PTR [rsp+88] + adcx r14, rbx + adcx r15, rcx + adcx rdi, rdx + adcx rsi, r11 + ; A[0] * A[0] + mov rdx, rbp + mulx rcx, r11, 
rdx + xor rbp, rbp + adcx r12, r12 + ; A[1] * A[1] + mov rdx, rax + adox r12, rcx + mulx rbx, rcx, rdx + adcx r13, r13 + adox r13, rcx + adcx r14, r14 + ; A[2] * A[2] + mov rdx, QWORD PTR [rsp+80] + adox r14, rbx + mulx rcx, rbx, rdx + adcx r15, r15 + adox r15, rbx + adcx rdi, rdi + ; A[3] * A[3] + mov rdx, QWORD PTR [rsp+88] + adox rdi, rcx + mulx rbx, rcx, rdx + adcx rsi, rsi + adox rsi, rcx + adcx rbp, rbp + adox rbp, rbx + mov rdx, 38 + mulx rbx, rbp, rbp + add r14, rbp + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r14, 1 + imul rbx, rbx, 19 + and r14, rcx + xor rcx, rcx + adox r11, rbx + mulx r15, rbx, r15 + adcx r11, rbx + adox r12, r15 + mulx rdi, rbx, rdi + adcx r12, rbx + adox r13, rdi + mulx rsi, rbx, rsi + adcx r13, rbx + adox r14, rsi + adcx r14, rcx + ; Store + mov QWORD PTR [rsp+64], r11 + mov QWORD PTR [rsp+72], r12 + mov QWORD PTR [rsp+80], r13 + mov QWORD PTR [rsp+88], r14 + mov rdx, 121666 + mulx rbp, r11, QWORD PTR [rsp+128] + mulx rsi, r12, QWORD PTR [rsp+136] + mulx rdi, r13, QWORD PTR [rsp+144] + add r12, rbp + mulx r15, r14, QWORD PTR [rsp+152] + adc r13, rsi + adc r14, rdi + adc r15, 0 + add r11, QWORD PTR [rsp+96] + adc r12, QWORD PTR [rsp+104] + adc r13, QWORD PTR [rsp+112] + adc r14, QWORD PTR [rsp+120] + adc r15, 0 + shld r15, r14, 1 + btr r14, 63 + imul r15, r15, 19 + add r11, r15 + adc r12, 0 + adc r13, 0 + adc r14, 0 + mov QWORD PTR [rsp+96], r11 + mov QWORD PTR [rsp+104], r12 + mov QWORD PTR [rsp+112], r13 + mov QWORD PTR [rsp+120], r14 + mov rax, QWORD PTR [r8] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rsp+32] + mulx r12, r11, rax + ; A[2] * B[0] + mulx r14, r13, QWORD PTR [r8+16] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [r8+8] + xor rbp, rbp + adcx r12, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [rsp+40] + mulx rdi, r15, QWORD PTR [r8+24] + adcx r13, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r12, rcx + ; A[2] * B[1] + mulx rsi, rcx, QWORD PTR [r8+16] + adox r13, rbx + adcx r14, rcx + ; A[1] * B[2] + mov 
rdx, QWORD PTR [rsp+48] + mulx rbx, rcx, QWORD PTR [r8+8] + adcx r15, rsi + adox r14, rcx + adcx rdi, rbp + adox r15, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox rdi, rbp + xor rsi, rsi + adcx r13, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [rsp+40] + mulx rcx, rdx, QWORD PTR [r8+8] + adcx r14, rbx + adox r13, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rsp+56] + adox r14, rcx + mulx rbx, rcx, QWORD PTR [r8+8] + adcx r15, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [rsp+48] + mulx rcx, rdx, QWORD PTR [r8+16] + adcx rdi, rbx + adox r15, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rsp+56] + adox rdi, rcx + mulx rbx, rcx, QWORD PTR [r8+24] + adox rsi, rbp + adcx rsi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rbp, rbx + xor rbx, rbx + adcx r14, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [r8+24] + adcx r15, rcx + mulx rcx, rdx, QWORD PTR [rsp+32] + adox r14, rdx + adox r15, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [r8+24] + mulx rcx, rdx, QWORD PTR [rsp+48] + adcx rdi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rsp+56] + adcx rsi, rcx + mulx rdx, rcx, QWORD PTR [r8+16] + adcx rbp, rbx + adox rdi, rcx + adox rsi, rdx + adox rbp, rbx + mov rdx, 38 + mulx rcx, rbp, rbp + add r14, rbp + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r14, 1 + imul rcx, rcx, 19 + and r14, rbx + xor rbx, rbx + adox r11, rcx + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + mulx rsi, rcx, rsi + adcx r13, rcx + adox r14, rsi + adcx r14, rbx + ; Store + mov QWORD PTR [rsp+32], r11 + mov QWORD PTR [rsp+40], r12 + mov QWORD PTR [rsp+48], r13 + mov QWORD PTR [rsp+56], r14 + mov rax, QWORD PTR [rsp+96] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rsp+128] + mulx r12, r11, rax + ; A[2] * B[0] + mulx r14, r13, QWORD PTR [rsp+112] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [rsp+104] + xor rbp, rbp + adcx r12, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [rsp+136] + mulx rdi, r15, QWORD PTR [rsp+120] + adcx r13, rbx + ; A[0] * B[1] + mulx 
rbx, rcx, rax + adox r12, rcx + ; A[2] * B[1] + mulx rsi, rcx, QWORD PTR [rsp+112] + adox r13, rbx + adcx r14, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [rsp+144] + mulx rbx, rcx, QWORD PTR [rsp+104] + adcx r15, rsi + adox r14, rcx + adcx rdi, rbp + adox r15, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox rdi, rbp + xor rsi, rsi + adcx r13, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [rsp+136] + mulx rcx, rdx, QWORD PTR [rsp+104] + adcx r14, rbx + adox r13, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rsp+152] + adox r14, rcx + mulx rbx, rcx, QWORD PTR [rsp+104] + adcx r15, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [rsp+144] + mulx rcx, rdx, QWORD PTR [rsp+112] + adcx rdi, rbx + adox r15, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rsp+152] + adox rdi, rcx + mulx rbx, rcx, QWORD PTR [rsp+120] + adox rsi, rbp + adcx rsi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rbp, rbx + xor rbx, rbx + adcx r14, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rsp+120] + adcx r15, rcx + mulx rcx, rdx, QWORD PTR [rsp+128] + adox r14, rdx + adox r15, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [rsp+120] + mulx rcx, rdx, QWORD PTR [rsp+144] + adcx rdi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rsp+152] + adcx rsi, rcx + mulx rdx, rcx, QWORD PTR [rsp+112] + adcx rbp, rbx + adox rdi, rcx + adox rsi, rdx + adox rbp, rbx + mov rdx, 38 + mulx rcx, rbp, rbp + add r14, rbp + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r14, 1 + imul rcx, rcx, 19 + and r14, rbx + xor rbx, rbx + adox r11, rcx + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + mulx rsi, rcx, rsi + adcx r13, rcx + adox r14, rsi + adcx r14, rbx + ; Store + mov QWORD PTR [rsp], r11 + mov QWORD PTR [rsp+8], r12 + mov QWORD PTR [rsp+16], r13 + mov QWORD PTR [rsp+24], r14 + mov rbx, QWORD PTR [rsp+160] + dec rbx + cmp rbx, 3 + jge L_curve25519_avx2_bits + mov QWORD PTR [rsp+160], 2 + mov rax, QWORD PTR [rsp+176] + neg rax + ; Conditional Swap + mov r11, QWORD PTR [r9] + mov r12, 
QWORD PTR [r9+8] + mov r13, QWORD PTR [r9+16] + mov r14, QWORD PTR [r9+24] + mov r15, QWORD PTR [rsp] + mov rdi, QWORD PTR [rsp+8] + mov rsi, QWORD PTR [rsp+16] + mov rbp, QWORD PTR [rsp+24] + xor r11, QWORD PTR [rsp+64] + xor r12, QWORD PTR [rsp+72] + xor r13, QWORD PTR [rsp+80] + xor r14, QWORD PTR [rsp+88] + xor r15, QWORD PTR [rsp+32] + xor rdi, QWORD PTR [rsp+40] + xor rsi, QWORD PTR [rsp+48] + xor rbp, QWORD PTR [rsp+56] + and r11, rax + and r12, rax + and r13, rax + and r14, rax + and r15, rax + and rdi, rax + and rsi, rax + and rbp, rax + xor QWORD PTR [r9], r11 + xor QWORD PTR [r9+8], r12 + xor QWORD PTR [r9+16], r13 + xor QWORD PTR [r9+24], r14 + xor QWORD PTR [rsp], r15 + xor QWORD PTR [rsp+8], rdi + xor QWORD PTR [rsp+16], rsi + xor QWORD PTR [rsp+24], rbp + xor QWORD PTR [rsp+64], r11 + xor QWORD PTR [rsp+72], r12 + xor QWORD PTR [rsp+80], r13 + xor QWORD PTR [rsp+88], r14 + xor QWORD PTR [rsp+32], r15 + xor QWORD PTR [rsp+40], rdi + xor QWORD PTR [rsp+48], rsi + xor QWORD PTR [rsp+56], rbp +L_curve25519_avx2_last_3: + ; Add-Sub + ; Add + mov r11, QWORD PTR [r9] + mov r12, QWORD PTR [r9+8] + mov r13, QWORD PTR [r9+16] + mov r14, QWORD PTR [r9+24] + mov r15, r11 + add r11, QWORD PTR [rsp] + mov rdi, r12 + adc r12, QWORD PTR [rsp+8] + mov rsi, r13 + adc r13, QWORD PTR [rsp+16] + mov rbp, r14 + adc r14, QWORD PTR [rsp+24] + mov rbx, 0 + adc rbx, 0 + shld rbx, r14, 1 + imul rbx, 19 + btr r14, 63 + ; Sub modulus (if overflow) + add r11, rbx + adc r12, 0 + adc r13, 0 + adc r14, 0 + ; Sub + sub r15, QWORD PTR [rsp] + sbb rdi, QWORD PTR [rsp+8] + sbb rsi, QWORD PTR [rsp+16] + sbb rbp, QWORD PTR [rsp+24] + sbb rbx, rbx + shld rbx, rbp, 1 + imul rbx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub r15, rbx + sbb rdi, 0 + sbb rsi, 0 + sbb rbp, 0 + mov QWORD PTR [r9], r11 + mov QWORD PTR [r9+8], r12 + mov QWORD PTR [r9+16], r13 + mov QWORD PTR [r9+24], r14 + mov QWORD PTR [rsp+128], r15 + mov QWORD PTR [rsp+136], rdi + mov QWORD PTR [rsp+144], rsi + mov 
QWORD PTR [rsp+152], rbp + ; Square + mov rdx, QWORD PTR [rsp+128] + mov rax, QWORD PTR [rsp+136] + ; A[0] * A[1] + mov rbp, rdx + mulx r13, r12, rax + ; A[0] * A[3] + mulx r15, r14, QWORD PTR [rsp+152] + ; A[2] * A[1] + mov rdx, QWORD PTR [rsp+144] + mulx rbx, rcx, rax + xor r11, r11 + adox r14, rcx + ; A[2] * A[3] + mulx rsi, rdi, QWORD PTR [rsp+152] + adox r15, rbx + ; A[2] * A[0] + mulx rbx, rcx, rbp + adox rdi, r11 + adcx r13, rcx + adox rsi, r11 + ; A[1] * A[3] + mov rdx, rax + mulx rdx, rcx, QWORD PTR [rsp+152] + adcx r14, rbx + adcx r15, rcx + adcx rdi, rdx + adcx rsi, r11 + ; A[0] * A[0] + mov rdx, rbp + mulx rcx, r11, rdx + xor rbp, rbp + adcx r12, r12 + ; A[1] * A[1] + mov rdx, rax + adox r12, rcx + mulx rbx, rcx, rdx + adcx r13, r13 + adox r13, rcx + adcx r14, r14 + ; A[2] * A[2] + mov rdx, QWORD PTR [rsp+144] + adox r14, rbx + mulx rcx, rbx, rdx + adcx r15, r15 + adox r15, rbx + adcx rdi, rdi + ; A[3] * A[3] + mov rdx, QWORD PTR [rsp+152] + adox rdi, rcx + mulx rbx, rcx, rdx + adcx rsi, rsi + adox rsi, rcx + adcx rbp, rbp + adox rbp, rbx + mov rdx, 38 + mulx rbx, rbp, rbp + add r14, rbp + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r14, 1 + imul rbx, rbx, 19 + and r14, rcx + xor rcx, rcx + adox r11, rbx + mulx r15, rbx, r15 + adcx r11, rbx + adox r12, r15 + mulx rdi, rbx, rdi + adcx r12, rbx + adox r13, rdi + mulx rsi, rbx, rsi + adcx r13, rbx + adox r14, rsi + adcx r14, rcx + ; Store + mov QWORD PTR [rsp+96], r11 + mov QWORD PTR [rsp+104], r12 + mov QWORD PTR [rsp+112], r13 + mov QWORD PTR [rsp+120], r14 + ; Square + mov rdx, QWORD PTR [r9] + mov rax, QWORD PTR [r9+8] + ; A[0] * A[1] + mov rbp, rdx + mulx r13, r12, rax + ; A[0] * A[3] + mulx r15, r14, QWORD PTR [r9+24] + ; A[2] * A[1] + mov rdx, QWORD PTR [r9+16] + mulx rbx, rcx, rax + xor r11, r11 + adox r14, rcx + ; A[2] * A[3] + mulx rsi, rdi, QWORD PTR [r9+24] + adox r15, rbx + ; A[2] * A[0] + mulx rbx, rcx, rbp + adox rdi, r11 + adcx r13, rcx + adox rsi, r11 + ; A[1] * A[3] + mov rdx, 
rax + mulx rdx, rcx, QWORD PTR [r9+24] + adcx r14, rbx + adcx r15, rcx + adcx rdi, rdx + adcx rsi, r11 + ; A[0] * A[0] + mov rdx, rbp + mulx rcx, r11, rdx + xor rbp, rbp + adcx r12, r12 + ; A[1] * A[1] + mov rdx, rax + adox r12, rcx + mulx rbx, rcx, rdx + adcx r13, r13 + adox r13, rcx + adcx r14, r14 + ; A[2] * A[2] + mov rdx, QWORD PTR [r9+16] + adox r14, rbx + mulx rcx, rbx, rdx + adcx r15, r15 + adox r15, rbx + adcx rdi, rdi + ; A[3] * A[3] + mov rdx, QWORD PTR [r9+24] + adox rdi, rcx + mulx rbx, rcx, rdx + adcx rsi, rsi + adox rsi, rcx + adcx rbp, rbp + adox rbp, rbx + mov rdx, 38 + mulx rbx, rbp, rbp + add r14, rbp + adc rbx, 0 + mov rcx, 9223372036854775807 + shld rbx, r14, 1 + imul rbx, rbx, 19 + and r14, rcx + xor rcx, rcx + adox r11, rbx + mulx r15, rbx, r15 + adcx r11, rbx + adox r12, r15 + mulx rdi, rbx, rdi + adcx r12, rbx + adox r13, rdi + mulx rsi, rbx, rsi + adcx r13, rbx + adox r14, rsi + adcx r14, rcx + ; Store + mov QWORD PTR [rsp+128], r11 + mov QWORD PTR [rsp+136], r12 + mov QWORD PTR [rsp+144], r13 + mov QWORD PTR [rsp+152], r14 + mov rax, QWORD PTR [rsp+128] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rsp+96] + mulx r12, r11, rax + ; A[2] * B[0] + mulx r14, r13, QWORD PTR [rsp+144] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [rsp+136] + xor rbp, rbp + adcx r12, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [rsp+104] + mulx rdi, r15, QWORD PTR [rsp+152] + adcx r13, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r12, rcx + ; A[2] * B[1] + mulx rsi, rcx, QWORD PTR [rsp+144] + adox r13, rbx + adcx r14, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [rsp+112] + mulx rbx, rcx, QWORD PTR [rsp+136] + adcx r15, rsi + adox r14, rcx + adcx rdi, rbp + adox r15, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox rdi, rbp + xor rsi, rsi + adcx r13, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [rsp+104] + mulx rcx, rdx, QWORD PTR [rsp+136] + adcx r14, rbx + adox r13, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rsp+120] + adox r14, rcx + mulx rbx, rcx, QWORD PTR [rsp+136] 
+ adcx r15, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [rsp+112] + mulx rcx, rdx, QWORD PTR [rsp+144] + adcx rdi, rbx + adox r15, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rsp+120] + adox rdi, rcx + mulx rbx, rcx, QWORD PTR [rsp+152] + adox rsi, rbp + adcx rsi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rbp, rbx + xor rbx, rbx + adcx r14, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rsp+152] + adcx r15, rcx + mulx rcx, rdx, QWORD PTR [rsp+96] + adox r14, rdx + adox r15, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [rsp+152] + mulx rcx, rdx, QWORD PTR [rsp+112] + adcx rdi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rsp+120] + adcx rsi, rcx + mulx rdx, rcx, QWORD PTR [rsp+144] + adcx rbp, rbx + adox rdi, rcx + adox rsi, rdx + adox rbp, rbx + mov rdx, 38 + mulx rcx, rbp, rbp + add r14, rbp + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r14, 1 + imul rcx, rcx, 19 + and r14, rbx + xor rbx, rbx + adox r11, rcx + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + mulx rsi, rcx, rsi + adcx r13, rcx + adox r14, rsi + adcx r14, rbx + ; Store + mov QWORD PTR [r9], r11 + mov QWORD PTR [r9+8], r12 + mov QWORD PTR [r9+16], r13 + mov QWORD PTR [r9+24], r14 + ; Sub + mov r11, QWORD PTR [rsp+128] + mov r12, QWORD PTR [rsp+136] + mov r13, QWORD PTR [rsp+144] + mov r14, QWORD PTR [rsp+152] + sub r11, QWORD PTR [rsp+96] + sbb r12, QWORD PTR [rsp+104] + sbb r13, QWORD PTR [rsp+112] + sbb r14, QWORD PTR [rsp+120] + sbb rbx, rbx + shld rbx, r14, 1 + imul rbx, -19 + btr r14, 63 + ; Add modulus (if underflow) + sub r11, rbx + sbb r12, 0 + sbb r13, 0 + sbb r14, 0 + mov QWORD PTR [rsp+128], r11 + mov QWORD PTR [rsp+136], r12 + mov QWORD PTR [rsp+144], r13 + mov QWORD PTR [rsp+152], r14 + mov rdx, 121666 + mulx rbp, r11, QWORD PTR [rsp+128] + mulx rsi, r12, QWORD PTR [rsp+136] + mulx rdi, r13, QWORD PTR [rsp+144] + add r12, rbp + mulx r15, r14, QWORD PTR [rsp+152] + adc r13, rsi + adc r14, rdi + adc r15, 0 + add r11, QWORD PTR 
[rsp+96] + adc r12, QWORD PTR [rsp+104] + adc r13, QWORD PTR [rsp+112] + adc r14, QWORD PTR [rsp+120] + adc r15, 0 + shld r15, r14, 1 + btr r14, 63 + imul r15, r15, 19 + add r11, r15 + adc r12, 0 + adc r13, 0 + adc r14, 0 + mov QWORD PTR [rsp+96], r11 + mov QWORD PTR [rsp+104], r12 + mov QWORD PTR [rsp+112], r13 + mov QWORD PTR [rsp+120], r14 + mov rax, QWORD PTR [rsp+96] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rsp+128] + mulx r12, r11, rax + ; A[2] * B[0] + mulx r14, r13, QWORD PTR [rsp+112] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [rsp+104] + xor rbp, rbp + adcx r12, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [rsp+136] + mulx rdi, r15, QWORD PTR [rsp+120] + adcx r13, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r12, rcx + ; A[2] * B[1] + mulx rsi, rcx, QWORD PTR [rsp+112] + adox r13, rbx + adcx r14, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [rsp+144] + mulx rbx, rcx, QWORD PTR [rsp+104] + adcx r15, rsi + adox r14, rcx + adcx rdi, rbp + adox r15, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox rdi, rbp + xor rsi, rsi + adcx r13, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [rsp+136] + mulx rcx, rdx, QWORD PTR [rsp+104] + adcx r14, rbx + adox r13, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rsp+152] + adox r14, rcx + mulx rbx, rcx, QWORD PTR [rsp+104] + adcx r15, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [rsp+144] + mulx rcx, rdx, QWORD PTR [rsp+112] + adcx rdi, rbx + adox r15, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rsp+152] + adox rdi, rcx + mulx rbx, rcx, QWORD PTR [rsp+120] + adox rsi, rbp + adcx rsi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rbp, rbx + xor rbx, rbx + adcx r14, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rsp+120] + adcx r15, rcx + mulx rcx, rdx, QWORD PTR [rsp+128] + adox r14, rdx + adox r15, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [rsp+120] + mulx rcx, rdx, QWORD PTR [rsp+144] + adcx rdi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rsp+152] + adcx rsi, rcx + mulx rdx, rcx, QWORD PTR [rsp+112] + adcx rbp, rbx + adox rdi, rcx + adox 
rsi, rdx + adox rbp, rbx + mov rdx, 38 + mulx rcx, rbp, rbp + add r14, rbp + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r14, 1 + imul rcx, rcx, 19 + and r14, rbx + xor rbx, rbx + adox r11, rcx + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + mulx rsi, rcx, rsi + adcx r13, rcx + adox r14, rsi + adcx r14, rbx + ; Store + mov QWORD PTR [rsp], r11 + mov QWORD PTR [rsp+8], r12 + mov QWORD PTR [rsp+16], r13 + mov QWORD PTR [rsp+24], r14 + dec QWORD PTR [rsp+160] + jge L_curve25519_avx2_last_3 + ; Invert + lea rcx, QWORD PTR [rsp+32] + mov rdx, rsp + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + mov rdx, rsp + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+96] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 4 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 9 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+128] + mov r8, 19 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+128] + lea r8, QWORD PTR [rsp+96] 
+ call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 9 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 49 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+128] + lea rdx, QWORD PTR [rsp+128] + mov r8, 99 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+128] + lea r8, QWORD PTR [rsp+96] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+96] + lea rdx, QWORD PTR [rsp+96] + mov r8, 49 + call fe_sq_n_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+96] + lea r8, QWORD PTR [rsp+64] + call fe_mul_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + mov r8, 4 + call fe_sq_n_avx2 + mov rcx, rsp + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+32] + call fe_mul_avx2 + mov r9, QWORD PTR [rsp+168] + mov rax, QWORD PTR [r9] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [rsp] + mulx r12, r11, rax + ; A[2] * B[0] + mulx r14, r13, QWORD PTR [r9+16] + ; A[1] * B[0] + mulx rbx, rcx, QWORD PTR [r9+8] + xor rbp, rbp + adcx r12, rcx + ; A[3] * B[1] + mov rdx, QWORD PTR [rsp+8] + mulx rdi, r15, QWORD PTR [r9+24] + adcx r13, rbx + ; A[0] * B[1] + mulx rbx, rcx, rax + adox r12, rcx + ; A[2] * B[1] + mulx rsi, rcx, QWORD PTR [r9+16] + adox r13, rbx + adcx r14, rcx + ; A[1] * B[2] + mov rdx, QWORD PTR [rsp+16] + mulx rbx, rcx, QWORD PTR [r9+8] + adcx r15, rsi + 
adox r14, rcx + adcx rdi, rbp + adox r15, rbx + ; A[0] * B[2] + mulx rbx, rcx, rax + adox rdi, rbp + xor rsi, rsi + adcx r13, rcx + ; A[1] * B[1] + mov rdx, QWORD PTR [rsp+8] + mulx rcx, rdx, QWORD PTR [r9+8] + adcx r14, rbx + adox r13, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [rsp+24] + adox r14, rcx + mulx rbx, rcx, QWORD PTR [r9+8] + adcx r15, rcx + ; A[2] * B[2] + mov rdx, QWORD PTR [rsp+16] + mulx rcx, rdx, QWORD PTR [r9+16] + adcx rdi, rbx + adox r15, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [rsp+24] + adox rdi, rcx + mulx rbx, rcx, QWORD PTR [r9+24] + adox rsi, rbp + adcx rsi, rcx + ; A[0] * B[3] + mulx rcx, rdx, rax + adcx rbp, rbx + xor rbx, rbx + adcx r14, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [r9+24] + adcx r15, rcx + mulx rcx, rdx, QWORD PTR [rsp] + adox r14, rdx + adox r15, rcx + ; A[3] * B[2] + mov rdx, QWORD PTR [r9+24] + mulx rcx, rdx, QWORD PTR [rsp+16] + adcx rdi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [rsp+24] + adcx rsi, rcx + mulx rdx, rcx, QWORD PTR [r9+16] + adcx rbp, rbx + adox rdi, rcx + adox rsi, rdx + adox rbp, rbx + mov rdx, 38 + mulx rcx, rbp, rbp + add r14, rbp + adc rcx, 0 + mov rbx, 9223372036854775807 + shld rcx, r14, 1 + imul rcx, rcx, 19 + and r14, rbx + xor rbx, rbx + adox r11, rcx + mulx r15, rcx, r15 + adcx r11, rcx + adox r12, r15 + mulx rdi, rcx, rdi + adcx r12, rcx + adox r13, rdi + mulx rsi, rcx, rsi + adcx r13, rcx + adox r14, rsi + adcx r14, rbx + mov rbx, 9223372036854775807 + mov rdx, r14 + sar rdx, 63 + and rdx, 19 + and r14, rbx + add r11, rdx + adc r12, 0 + adc r13, 0 + adc r14, 0 + mov rcx, 9223372036854775807 + mov rdx, r11 + add rdx, 19 + mov rdx, r12 + adc rdx, 0 + mov rdx, r13 + adc rdx, 0 + mov rdx, r14 + adc rdx, 0 + sar rdx, 63 + and rdx, 19 + and r14, rcx + add r11, rdx + adc r12, 0 + adc r13, 0 + adc r14, 0 + and r14, rcx + ; Store + mov QWORD PTR [r9], r11 + mov QWORD PTR [r9+8], r12 + mov QWORD PTR [r9+16], r13 + mov QWORD PTR [r9+24], r14 + xor rax, rax + add rsp, 184 + pop rbp + pop rsi + pop rdi + 
pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +curve25519_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA + ; fe_pow22523_avx2: raise a field element z to the power 2^252-3 using a fixed + ; chain of squarings and multiplications (the exponents noted below assume + ; fe_sq_n_avx2 performs r8 successive squarings - confirm in its definition). + ; In: rcx = result pointer, rdx = pointer to z. + ; As called here the helpers take rcx = destination, rdx = source and + ; r8 = second source (fe_mul_avx2) or repeat count (fe_sq_n_avx2). + ; Stack frame (112 bytes): t0 at [rsp], t1 at [rsp+32], t2 at [rsp+64], + ; saved rcx at [rsp+96], saved rdx at [rsp+104]. +fe_pow22523_avx2 PROC + sub rsp, 112 + ; pow22523 + ; Spill both arguments because rcx and rdx are reloaded for every helper call + mov QWORD PTR [rsp+96], rcx + mov QWORD PTR [rsp+104], rdx + ; t0 = z^2 + mov rcx, rsp + mov rdx, QWORD PTR [rsp+104] + call fe_sq_avx2 + ; t1 = t0^2 = z^4 + lea rcx, QWORD PTR [rsp+32] + mov rdx, rsp + call fe_sq_avx2 + ; t1 = t1^2 = z^8 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_avx2 + ; t1 = z * t1 = z^9 + lea rcx, QWORD PTR [rsp+32] + mov rdx, QWORD PTR [rsp+104] + lea r8, QWORD PTR [rsp+32] + call fe_mul_avx2 + ; t0 = t0 * t1 = z^11 + mov rcx, rsp + mov rdx, rsp + lea r8, QWORD PTR [rsp+32] + call fe_mul_avx2 + ; t0 = t0^2 = z^22 + mov rcx, rsp + mov rdx, rsp + call fe_sq_avx2 + ; t0 = t1 * t0 = z^31 = z^(2^5-1) + mov rcx, rsp + lea rdx, QWORD PTR [rsp+32] + mov r8, rsp + call fe_mul_avx2 + ; t1 = t0^(2^5): one squaring, then 4 more + lea rcx, QWORD PTR [rsp+32] + mov rdx, rsp + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, 4 + call fe_sq_n_avx2 + ; t0 = t1 * t0 = z^(2^10-1) + mov rcx, rsp + lea rdx, QWORD PTR [rsp+32] + mov r8, rsp + call fe_mul_avx2 + ; t1 = t0^(2^10): one squaring, then 9 more + lea rcx, QWORD PTR [rsp+32] + mov rdx, rsp + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, 9 + call fe_sq_n_avx2 + ; t1 = t1 * t0 = z^(2^20-1) + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, rsp + call fe_mul_avx2 + ; t2 = t1^(2^20): one squaring, then 19 more + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + mov r8, 19 + call fe_sq_n_avx2 + ; t1 = t2 * t1 = z^(2^40-1) + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+32] + call fe_mul_avx2 + ; t1 = t1^(2^10): one squaring, then 9 more + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, 9 + call fe_sq_n_avx2 + ; t0 = t1 * t0 = z^(2^50-1) + mov rcx, rsp + lea rdx, QWORD PTR [rsp+32] + mov r8, rsp + call fe_mul_avx2 + ; t1 = t0^(2^50): one squaring, then 49 more + lea rcx, QWORD PTR [rsp+32] + mov rdx, rsp + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, 49 + call fe_sq_n_avx2 + ; t1 = t1 * t0 = z^(2^100-1) + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, rsp + call
fe_mul_avx2 + ; t2 = t1^(2^100): one squaring, then 99 more + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+64] + lea rdx, QWORD PTR [rsp+64] + mov r8, 99 + call fe_sq_n_avx2 + ; t1 = t2 * t1 = z^(2^200-1) + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+64] + lea r8, QWORD PTR [rsp+32] + call fe_mul_avx2 + ; t1 = t1^(2^50): one squaring, then 49 more + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + call fe_sq_avx2 + lea rcx, QWORD PTR [rsp+32] + lea rdx, QWORD PTR [rsp+32] + mov r8, 49 + call fe_sq_n_avx2 + ; t0 = t1 * t0 = z^(2^250-1) + mov rcx, rsp + lea rdx, QWORD PTR [rsp+32] + mov r8, rsp + call fe_mul_avx2 + ; t0 = t0^4 (two squarings) = z^(2^252-4) + mov rcx, rsp + mov rdx, rsp + call fe_sq_avx2 + mov rcx, rsp + mov rdx, rsp + call fe_sq_avx2 + ; result = t0 * z = z^(2^252-3), written through the saved rcx pointer + mov rcx, QWORD PTR [rsp+96] + mov rdx, rsp + mov r8, QWORD PTR [rsp+104] + call fe_mul_avx2 + ; Reload the original argument values into rdx and rcx before returning + mov rdx, QWORD PTR [rsp+104] + mov rcx, QWORD PTR [rsp+96] + add rsp, 112 + ret +fe_pow22523_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA + ; ge_p1p1_to_p2_avx2: three 4-limb field multiplications, each followed by a + ; reduction back to 4 limbs. + ; In: rcx = result pointer r, rdx = input pointer p (copied to rax). + ; r[0..31] = p[0..31] * p[96..127] + ; r[64..95] = p[64..95] * p[96..127] + ; r[32..63] = p[64..95] * p[32..63] + ; (byte ranges; presumably the X, Y, Z, T coordinates at 0, 32, 64, 96 - + ; confirm against the C structure definitions) +ge_p1p1_to_p2_avx2 PROC + ; Preserve r12-r15, rdi, rsi, rbx and rbp, all of which are overwritten below + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + ; Keep p in rax because rdx is the implicit multiplicand of mulx + mov rax, rdx + sub rsp, 16 + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], rax + ; First product: r[0..31] = p[0..31] * p[96..127] + lea r8, QWORD PTR [rax+96] + mov r11, QWORD PTR [rax] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, r11 + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rax+16] + ; A[1] * B[0] + mulx r10, r9, QWORD PTR [rax+8] + xor rbp, rbp + adcx r13, r9 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rax+24] + adcx r14, r10 + ; A[0] * B[1] + mulx r10, r9, r11 + adox r13, r9 + ; A[2] * B[1] + mulx rbx, r9, QWORD PTR [rax+16] + adox r14, r10 + adcx r15, r9 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, r9, QWORD PTR [rax+8] + adcx rdi, rbx + adox r15, r9 + adcx rsi, rbp + adox rdi, r10 + ; A[0] * B[2] + mulx r10, r9, r11 + adox rsi, rbp + xor rbx, rbx + adcx r14, r9 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r9, rdx, QWORD PTR [rax+8] + adcx r15, r10 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r9 + mulx r10, r9, QWORD PTR [rax+8] +
adcx rdi, r9 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r9, rdx, QWORD PTR [rax+16] + adcx rsi, r10 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r9 + mulx r10, r9, QWORD PTR [rax+24] + adox rbx, rbp + adcx rbx, r9 + ; A[0] * B[3] + mulx r9, rdx, r11 + adcx rbp, r10 + xor r10, r10 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rax+24] + adcx rdi, r9 + mulx r9, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r9 + ; A[3] * B[2] + mov rdx, QWORD PTR [rax+24] + mulx r9, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r9 + mulx rdx, r9, QWORD PTR [rax+16] + adcx rbp, r10 + adox rsi, r9 + adox rbx, rdx + adox rbp, r10 + ; Reduce the 8-limb product to 4 limbs: each upper limb is scaled by 38, + ; the bits from position 255 upward by 19, and the top limb is masked with + ; 2^63-1 (consistent with arithmetic modulo 2^255-19) + mov rdx, 38 + mulx r9, rbp, rbp + add r15, rbp + adc r9, 0 + mov r10, 9223372036854775807 + shld r9, r15, 1 + imul r9, r9, 19 + and r15, r10 + xor r10, r10 + adox r12, r9 + mulx rdi, r9, rdi + adcx r12, r9 + adox r13, rdi + mulx rsi, r9, rsi + adcx r13, r9 + adox r14, rsi + mulx rbx, r9, rbx + adcx r14, r9 + adox r15, rbx + adcx r15, r10 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + ; Second product: r[64..95] = p[64..95] * p[96..127] (r8 still addresses p+96) + lea rax, QWORD PTR [rax+64] + lea rcx, QWORD PTR [rcx+64] + mov r11, QWORD PTR [rax] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, r11 + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rax+16] + ; A[1] * B[0] + mulx r10, r9, QWORD PTR [rax+8] + xor rbp, rbp + adcx r13, r9 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rax+24] + adcx r14, r10 + ; A[0] * B[1] + mulx r10, r9, r11 + adox r13, r9 + ; A[2] * B[1] + mulx rbx, r9, QWORD PTR [rax+16] + adox r14, r10 + adcx r15, r9 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, r9, QWORD PTR [rax+8] + adcx rdi, rbx + adox r15, r9 + adcx rsi, rbp + adox rdi, r10 + ; A[0] * B[2] + mulx r10, r9, r11 + adox rsi, rbp + xor rbx, rbx + adcx r14, r9 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r9, rdx, QWORD PTR
[rax+8] + adcx r15, r10 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r9 + mulx r10, r9, QWORD PTR [rax+8] + adcx rdi, r9 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r9, rdx, QWORD PTR [rax+16] + adcx rsi, r10 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r9 + mulx r10, r9, QWORD PTR [rax+24] + adox rbx, rbp + adcx rbx, r9 + ; A[0] * B[3] + mulx r9, rdx, r11 + adcx rbp, r10 + xor r10, r10 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rax+24] + adcx rdi, r9 + mulx r9, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r9 + ; A[3] * B[2] + mov rdx, QWORD PTR [rax+24] + mulx r9, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r9 + mulx rdx, r9, QWORD PTR [rax+16] + adcx rbp, r10 + adox rsi, r9 + adox rbx, rdx + adox rbp, r10 + ; Reduce to 4 limbs (upper limbs times 38, bits from position 255 upward times 19) + mov rdx, 38 + mulx r9, rbp, rbp + add r15, rbp + adc r9, 0 + mov r10, 9223372036854775807 + shld r9, r15, 1 + imul r9, r9, 19 + and r15, r10 + xor r10, r10 + adox r12, r9 + mulx rdi, r9, rdi + adcx r12, r9 + adox r13, rdi + mulx rsi, r9, rsi + adcx r13, r9 + adox r14, rsi + mulx rbx, r9, rbx + adcx r14, r9 + adox r15, rbx + adcx r15, r10 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + ; Third product: r[32..63] = p[64..95] * p[32..63] + lea r8, QWORD PTR [rax+-32] + lea rcx, QWORD PTR [rcx+-32] + mov r11, QWORD PTR [rax] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, r11 + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rax+16] + ; A[1] * B[0] + mulx r10, r9, QWORD PTR [rax+8] + xor rbp, rbp + adcx r13, r9 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rax+24] + adcx r14, r10 + ; A[0] * B[1] + mulx r10, r9, r11 + adox r13, r9 + ; A[2] * B[1] + mulx rbx, r9, QWORD PTR [rax+16] + adox r14, r10 + adcx r15, r9 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, r9, QWORD PTR [rax+8] + adcx rdi, rbx + adox r15, r9 + adcx rsi, rbp + adox rdi, r10 + ; A[0] * B[2] +
mulx r10, r9, r11 + adox rsi, rbp + xor rbx, rbx + adcx r14, r9 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r9, rdx, QWORD PTR [rax+8] + adcx r15, r10 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r9 + mulx r10, r9, QWORD PTR [rax+8] + adcx rdi, r9 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r9, rdx, QWORD PTR [rax+16] + adcx rsi, r10 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r9 + mulx r10, r9, QWORD PTR [rax+24] + adox rbx, rbp + adcx rbx, r9 + ; A[0] * B[3] + mulx r9, rdx, r11 + adcx rbp, r10 + xor r10, r10 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rax+24] + adcx rdi, r9 + mulx r9, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r9 + ; A[3] * B[2] + mov rdx, QWORD PTR [rax+24] + mulx r9, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r9 + mulx rdx, r9, QWORD PTR [rax+16] + adcx rbp, r10 + adox rsi, r9 + adox rbx, rdx + adox rbp, r10 + ; Reduce to 4 limbs (upper limbs times 38, bits from position 255 upward times 19) + mov rdx, 38 + mulx r9, rbp, rbp + add r15, rbp + adc r9, 0 + mov r10, 9223372036854775807 + shld r9, r15, 1 + imul r9, r9, 19 + and r15, r10 + xor r10, r10 + adox r12, r9 + mulx rdi, r9, rdi + adcx r12, r9 + adox r13, rdi + mulx rsi, r9, rsi + adcx r13, r9 + adox r14, rsi + mulx rbx, r9, rbx + adcx r14, r9 + adox r15, rbx + adcx r15, r10 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + ; Drop the 16-byte scratch area and restore the saved registers + add rsp, 16 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +ge_p1p1_to_p2_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA + ; ge_p1p1_to_p3_avx2: four 4-limb field multiplications, each followed by a + ; reduction back to 4 limbs. + ; In: rcx = result pointer r, rdx = input pointer p (copied to rax). + ; r[0..31] = p[0..31] * p[96..127] + ; r[96..127] = p[0..31] * p[32..63] + ; r[32..63] = p[64..95] * p[32..63] + ; r[64..95] = p[64..95] * p[96..127] + ; (byte ranges; presumably the X, Y, Z, T coordinates at 0, 32, 64, 96 - + ; confirm against the C structure definitions) +ge_p1p1_to_p3_avx2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + ; Keep p in rax because rdx is the implicit multiplicand of mulx + mov rax, rdx + sub rsp, 16 + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], rax + ; First product: r[0..31] = p[0..31] * p[96..127] + lea r8, QWORD PTR [rax+96] + mov r11, QWORD PTR [rax] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, r11 + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rax+16] +
; A[1] * B[0] + mulx r10, r9, QWORD PTR [rax+8] + xor rbp, rbp + adcx r13, r9 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rax+24] + adcx r14, r10 + ; A[0] * B[1] + mulx r10, r9, r11 + adox r13, r9 + ; A[2] * B[1] + mulx rbx, r9, QWORD PTR [rax+16] + adox r14, r10 + adcx r15, r9 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, r9, QWORD PTR [rax+8] + adcx rdi, rbx + adox r15, r9 + adcx rsi, rbp + adox rdi, r10 + ; A[0] * B[2] + mulx r10, r9, r11 + adox rsi, rbp + xor rbx, rbx + adcx r14, r9 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r9, rdx, QWORD PTR [rax+8] + adcx r15, r10 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r9 + mulx r10, r9, QWORD PTR [rax+8] + adcx rdi, r9 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r9, rdx, QWORD PTR [rax+16] + adcx rsi, r10 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r9 + mulx r10, r9, QWORD PTR [rax+24] + adox rbx, rbp + adcx rbx, r9 + ; A[0] * B[3] + mulx r9, rdx, r11 + adcx rbp, r10 + xor r10, r10 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rax+24] + adcx rdi, r9 + mulx r9, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r9 + ; A[3] * B[2] + mov rdx, QWORD PTR [rax+24] + mulx r9, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r9 + mulx rdx, r9, QWORD PTR [rax+16] + adcx rbp, r10 + adox rsi, r9 + adox rbx, rdx + adox rbp, r10 + ; Reduce the 8-limb product to 4 limbs: each upper limb is scaled by 38, + ; the bits from position 255 upward by 19, and the top limb is masked with + ; 2^63-1 (consistent with arithmetic modulo 2^255-19) + mov rdx, 38 + mulx r9, rbp, rbp + add r15, rbp + adc r9, 0 + mov r10, 9223372036854775807 + shld r9, r15, 1 + imul r9, r9, 19 + and r15, r10 + xor r10, r10 + adox r12, r9 + mulx rdi, r9, rdi + adcx r12, r9 + adox r13, rdi + mulx rsi, r9, rsi + adcx r13, r9 + adox r14, rsi + mulx rbx, r9, rbx + adcx r14, r9 + adox r15, rbx + adcx r15, r10 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + ; Second product: r[96..127] = p[0..31] * p[32..63] + lea r8, QWORD PTR [rax+32] + lea rcx, QWORD PTR [rcx+96] + mov r11, QWORD PTR
[rax] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, r11 + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rax+16] + ; A[1] * B[0] + mulx r10, r9, QWORD PTR [rax+8] + xor rbp, rbp + adcx r13, r9 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rax+24] + adcx r14, r10 + ; A[0] * B[1] + mulx r10, r9, r11 + adox r13, r9 + ; A[2] * B[1] + mulx rbx, r9, QWORD PTR [rax+16] + adox r14, r10 + adcx r15, r9 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, r9, QWORD PTR [rax+8] + adcx rdi, rbx + adox r15, r9 + adcx rsi, rbp + adox rdi, r10 + ; A[0] * B[2] + mulx r10, r9, r11 + adox rsi, rbp + xor rbx, rbx + adcx r14, r9 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r9, rdx, QWORD PTR [rax+8] + adcx r15, r10 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r9 + mulx r10, r9, QWORD PTR [rax+8] + adcx rdi, r9 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r9, rdx, QWORD PTR [rax+16] + adcx rsi, r10 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r9 + mulx r10, r9, QWORD PTR [rax+24] + adox rbx, rbp + adcx rbx, r9 + ; A[0] * B[3] + mulx r9, rdx, r11 + adcx rbp, r10 + xor r10, r10 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rax+24] + adcx rdi, r9 + mulx r9, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r9 + ; A[3] * B[2] + mov rdx, QWORD PTR [rax+24] + mulx r9, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r9 + mulx rdx, r9, QWORD PTR [rax+16] + adcx rbp, r10 + adox rsi, r9 + adox rbx, rdx + adox rbp, r10 + ; Reduce to 4 limbs (upper limbs times 38, bits from position 255 upward times 19) + mov rdx, 38 + mulx r9, rbp, rbp + add r15, rbp + adc r9, 0 + mov r10, 9223372036854775807 + shld r9, r15, 1 + imul r9, r9, 19 + and r15, r10 + xor r10, r10 + adox r12, r9 + mulx rdi, r9, rdi + adcx r12, r9 + adox r13, rdi + mulx rsi, r9, rsi + adcx r13, r9 + adox r14, rsi + mulx rbx, r9, rbx + adcx r14, r9 + adox r15, rbx + adcx r15, r10 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov
QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + ; Third product: r[32..63] = p[64..95] * p[32..63] (r8 still addresses p+32) + lea rax, QWORD PTR [rax+64] + lea rcx, QWORD PTR [rcx+-64] + mov r11, QWORD PTR [rax] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, r11 + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rax+16] + ; A[1] * B[0] + mulx r10, r9, QWORD PTR [rax+8] + xor rbp, rbp + adcx r13, r9 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rax+24] + adcx r14, r10 + ; A[0] * B[1] + mulx r10, r9, r11 + adox r13, r9 + ; A[2] * B[1] + mulx rbx, r9, QWORD PTR [rax+16] + adox r14, r10 + adcx r15, r9 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, r9, QWORD PTR [rax+8] + adcx rdi, rbx + adox r15, r9 + adcx rsi, rbp + adox rdi, r10 + ; A[0] * B[2] + mulx r10, r9, r11 + adox rsi, rbp + xor rbx, rbx + adcx r14, r9 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r9, rdx, QWORD PTR [rax+8] + adcx r15, r10 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r9 + mulx r10, r9, QWORD PTR [rax+8] + adcx rdi, r9 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r9, rdx, QWORD PTR [rax+16] + adcx rsi, r10 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r9 + mulx r10, r9, QWORD PTR [rax+24] + adox rbx, rbp + adcx rbx, r9 + ; A[0] * B[3] + mulx r9, rdx, r11 + adcx rbp, r10 + xor r10, r10 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rax+24] + adcx rdi, r9 + mulx r9, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r9 + ; A[3] * B[2] + mov rdx, QWORD PTR [rax+24] + mulx r9, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r9 + mulx rdx, r9, QWORD PTR [rax+16] + adcx rbp, r10 + adox rsi, r9 + adox rbx, rdx + adox rbp, r10 + ; Reduce to 4 limbs (upper limbs times 38, bits from position 255 upward times 19) + mov rdx, 38 + mulx r9, rbp, rbp + add r15, rbp + adc r9, 0 + mov r10, 9223372036854775807 + shld r9, r15, 1 + imul r9, r9, 19 + and r15, r10 + xor r10, r10 + adox r12, r9 + mulx rdi, r9, rdi + adcx r12, r9 + adox r13, rdi + mulx rsi, r9, rsi + adcx r13, r9 + adox r14, rsi +
mulx rbx, r9, rbx + adcx r14, r9 + adox r15, rbx + adcx r15, r10 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + ; Fourth product: r[64..95] = p[64..95] * p[96..127] + lea r8, QWORD PTR [rax+32] + lea rcx, QWORD PTR [rcx+32] + mov r11, QWORD PTR [rax] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, r11 + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rax+16] + ; A[1] * B[0] + mulx r10, r9, QWORD PTR [rax+8] + xor rbp, rbp + adcx r13, r9 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rax+24] + adcx r14, r10 + ; A[0] * B[1] + mulx r10, r9, r11 + adox r13, r9 + ; A[2] * B[1] + mulx rbx, r9, QWORD PTR [rax+16] + adox r14, r10 + adcx r15, r9 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, r9, QWORD PTR [rax+8] + adcx rdi, rbx + adox r15, r9 + adcx rsi, rbp + adox rdi, r10 + ; A[0] * B[2] + mulx r10, r9, r11 + adox rsi, rbp + xor rbx, rbx + adcx r14, r9 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r9, rdx, QWORD PTR [rax+8] + adcx r15, r10 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r9 + mulx r10, r9, QWORD PTR [rax+8] + adcx rdi, r9 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r9, rdx, QWORD PTR [rax+16] + adcx rsi, r10 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r9 + mulx r10, r9, QWORD PTR [rax+24] + adox rbx, rbp + adcx rbx, r9 + ; A[0] * B[3] + mulx r9, rdx, r11 + adcx rbp, r10 + xor r10, r10 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rax+24] + adcx rdi, r9 + mulx r9, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r9 + ; A[3] * B[2] + mov rdx, QWORD PTR [rax+24] + mulx r9, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r9 + mulx rdx, r9, QWORD PTR [rax+16] + adcx rbp, r10 + adox rsi, r9 + adox rbx, rdx + adox rbp, r10 + ; Reduce to 4 limbs (upper limbs times 38, bits from position 255 upward times 19) + mov rdx, 38 + mulx r9, rbp, rbp + add r15, rbp + adc r9, 0 + mov r10, 9223372036854775807 + shld r9, r15, 1 + imul r9, r9, 19 + and r15, r10 +
xor r10, r10 + adox r12, r9 + mulx rdi, r9, rdi + adcx r12, r9 + adox r13, rdi + mulx rsi, r9, rsi + adcx r13, r9 + adox r14, rsi + mulx rbx, r9, rbx + adcx r14, r9 + adox r15, rbx + adcx r15, r10 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + ; Drop the 16-byte scratch area and restore the saved registers + add rsp, 16 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +ge_p1p1_to_p3_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA + ; ge_p2_dbl_avx2: squarings, additions and subtractions on 4-limb field elements. + ; In: rcx = result pointer r, rdx = input pointer p (copied to rax). + ; r[32..63] = p[32..63]^2 plus p[0..31]^2 + ; r[64..95] = p[32..63]^2 minus p[0..31]^2 + ; r[0..31] = (p[0..31] plus p[32..63])^2 minus r[32..63] + ; r[96..127] = 2 * p[64..95]^2 minus r[64..95] + ; (byte ranges; this matches the usual point-doubling formulas if the + ; coordinates sit at offsets 0, 32, 64, 96 - confirm against the C code) +ge_p2_dbl_avx2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov rax, rdx + sub rsp, 16 + ; The copy of p at [rsp+8] is reloaded later in this procedure + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], rax + ; First squaring: p[0..31]^2, stored at r[64..95] for now + lea rcx, QWORD PTR [rcx+64] + ; Square + mov rdx, QWORD PTR [rax] + mov r11, QWORD PTR [rax+8] + ; A[0] * A[1] + mov rbp, rdx + mulx r14, r13, r11 + ; A[0] * A[3] + mulx rdi, r15, QWORD PTR [rax+24] + ; A[2] * A[1] + mov rdx, QWORD PTR [rax+16] + mulx r10, r9, r11 + xor r12, r12 + adox r15, r9 + ; A[2] * A[3] + mulx rbx, rsi, QWORD PTR [rax+24] + adox rdi, r10 + ; A[2] * A[0] + mulx r10, r9, rbp + adox rsi, r12 + adcx r14, r9 + adox rbx, r12 + ; A[1] * A[3] + mov rdx, r11 + mulx rdx, r9, QWORD PTR [rax+24] + adcx r15, r10 + adcx rdi, r9 + adcx rsi, rdx + adcx rbx, r12 + ; Double the cross-product sum (adcx x, x) while adding the limb squares (adox) + ; A[0] * A[0] + mov rdx, rbp + mulx r9, r12, rdx + xor rbp, rbp + adcx r13, r13 + ; A[1] * A[1] + mov rdx, r11 + adox r13, r9 + mulx r10, r9, rdx + adcx r14, r14 + adox r14, r9 + adcx r15, r15 + ; A[2] * A[2] + mov rdx, QWORD PTR [rax+16] + adox r15, r10 + mulx r9, r10, rdx + adcx rdi, rdi + adox rdi, r10 + adcx rsi, rsi + ; A[3] * A[3] + mov rdx, QWORD PTR [rax+24] + adox rsi, r9 + mulx r10, r9, rdx + adcx rbx, rbx + adox rbx, r9 + adcx rbp, rbp + adox rbp, r10 + ; Reduce to 4 limbs (upper limbs times 38, bits from position 255 upward times 19) + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r9, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r9 + xor r9, r9 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 +
adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r9 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + lea rax, QWORD PTR [rax+32] + ; Square + mov rdx, QWORD PTR [rax] + mov r11, QWORD PTR [rax+8] + ; A[0] * A[1] + mov rbp, rdx + mulx r14, r13, r11 + ; A[0] * A[3] + mulx rdi, r15, QWORD PTR [rax+24] + ; A[2] * A[1] + mov rdx, QWORD PTR [rax+16] + mulx r10, r9, r11 + xor r12, r12 + adox r15, r9 + ; A[2] * A[3] + mulx rbx, rsi, QWORD PTR [rax+24] + adox rdi, r10 + ; A[2] * A[0] + mulx r10, r9, rbp + adox rsi, r12 + adcx r14, r9 + adox rbx, r12 + ; A[1] * A[3] + mov rdx, r11 + mulx rdx, r9, QWORD PTR [rax+24] + adcx r15, r10 + adcx rdi, r9 + adcx rsi, rdx + adcx rbx, r12 + ; A[0] * A[0] + mov rdx, rbp + mulx r9, r12, rdx + xor rbp, rbp + adcx r13, r13 + ; A[1] * A[1] + mov rdx, r11 + adox r13, r9 + mulx r10, r9, rdx + adcx r14, r14 + adox r14, r9 + adcx r15, r15 + ; A[2] * A[2] + mov rdx, QWORD PTR [rax+16] + adox r15, r10 + mulx r9, r10, rdx + adcx rdi, rdi + adox rdi, r10 + adcx rsi, rsi + ; A[3] * A[3] + mov rdx, QWORD PTR [rax+24] + adox rsi, r9 + mulx r10, r9, rdx + adcx rbx, rbx + adox rbx, r9 + adcx rbp, rbp + adox rbp, r10 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r9, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r9 + xor r9, r9 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r9 + ; Store + mov rax, rcx + lea rcx, QWORD PTR [rcx+-32] + ; Add-Sub + ; Add + mov rdi, r12 + add r12, QWORD PTR [rax] + mov rsi, r13 + adc r13, QWORD PTR [rax+8] + mov rbx, r14 + adc r14, QWORD PTR [rax+16] + mov rbp, r15 + adc r15, QWORD PTR [rax+24] + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 
+ adc r15, 0 + ; Sub + sub rdi, QWORD PTR [rax] + sbb rsi, QWORD PTR [rax+8] + sbb rbx, QWORD PTR [rax+16] + sbb rbp, QWORD PTR [rax+24] + sbb rdx, rdx + shld rdx, rbp, 1 + imul rdx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, rdx + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [rax], rdi + mov QWORD PTR [rax+8], rsi + mov QWORD PTR [rax+16], rbx + mov QWORD PTR [rax+24], rbp + mov r8, QWORD PTR [rsp+8] + lea rax, QWORD PTR [r8+32] + lea rcx, QWORD PTR [rcx+-32] + ; Add + mov r12, QWORD PTR [rax] + mov r13, QWORD PTR [rax+8] + add r12, QWORD PTR [r8] + mov r14, QWORD PTR [rax+16] + adc r13, QWORD PTR [r8+8] + mov r15, QWORD PTR [rax+24] + adc r14, QWORD PTR [r8+16] + adc r15, QWORD PTR [r8+24] + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + ; Square + mov rdx, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+8] + ; A[0] * A[1] + mov rbp, rdx + mulx r14, r13, r11 + ; A[0] * A[3] + mulx rdi, r15, QWORD PTR [rcx+24] + ; A[2] * A[1] + mov rdx, QWORD PTR [rcx+16] + mulx r10, r9, r11 + xor r12, r12 + adox r15, r9 + ; A[2] * A[3] + mulx rbx, rsi, QWORD PTR [rcx+24] + adox rdi, r10 + ; A[2] * A[0] + mulx r10, r9, rbp + adox rsi, r12 + adcx r14, r9 + adox rbx, r12 + ; A[1] * A[3] + mov rdx, r11 + mulx rdx, r9, QWORD PTR [rcx+24] + adcx r15, r10 + adcx rdi, r9 + adcx rsi, rdx + adcx rbx, r12 + ; A[0] * A[0] + mov rdx, rbp + mulx r9, r12, rdx + xor rbp, rbp + adcx r13, r13 + ; A[1] * A[1] + mov rdx, r11 + adox r13, r9 + mulx r10, r9, rdx + adcx r14, r14 + adox r14, r9 + adcx r15, r15 + ; A[2] * A[2] + mov rdx, QWORD PTR [rcx+16] + adox r15, r10 + mulx r9, r10, rdx + adcx rdi, rdi + adox rdi, r10 + adcx rsi, rsi + ; A[3] * 
A[3] + mov rdx, QWORD PTR [rcx+24] + adox rsi, r9 + mulx r10, r9, rdx + adcx rbx, rbx + adox rbx, r9 + adcx rbp, rbp + adox rbp, r10 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r9, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r9 + xor r9, r9 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r9 + ; Store + lea rax, QWORD PTR [rcx+32] + ; Sub + sub r12, QWORD PTR [rax] + sbb r13, QWORD PTR [rax+8] + sbb r14, QWORD PTR [rax+16] + sbb r15, QWORD PTR [rax+24] + sbb rdx, rdx + shld rdx, r15, 1 + imul rdx, -19 + btr r15, 63 + ; Add modulus (if underflow) + sub r12, rdx + sbb r13, 0 + sbb r14, 0 + sbb r15, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + lea r8, QWORD PTR [r8+64] + ; Square * 2 + mov rdx, QWORD PTR [r8] + mov r11, QWORD PTR [r8+8] + ; A[0] * A[1] + mov rbp, rdx + mulx r14, r13, r11 + ; A[0] * A[3] + mulx rdi, r15, QWORD PTR [r8+24] + ; A[2] * A[1] + mov rdx, QWORD PTR [r8+16] + mulx r10, r9, r11 + xor r12, r12 + adox r15, r9 + ; A[2] * A[3] + mulx rbx, rsi, QWORD PTR [r8+24] + adox rdi, r10 + ; A[2] * A[0] + mulx r10, r9, rbp + adox rsi, r12 + adcx r14, r9 + adox rbx, r12 + ; A[1] * A[3] + mov rdx, r11 + mulx rdx, r9, QWORD PTR [r8+24] + adcx r15, r10 + adcx rdi, r9 + adcx rsi, rdx + adcx rbx, r12 + ; A[0] * A[0] + mov rdx, rbp + mulx r9, r12, rdx + xor rbp, rbp + adcx r13, r13 + ; A[1] * A[1] + mov rdx, r11 + adox r13, r9 + mulx r10, r9, rdx + adcx r14, r14 + adox r14, r9 + adcx r15, r15 + ; A[2] * A[2] + mov rdx, QWORD PTR [r8+16] + adox r15, r10 + mulx r9, r10, rdx + adcx rdi, rdi + adox rdi, r10 + adcx rsi, rsi + ; A[3] * A[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r9 + mulx r10, r9, rdx + adcx rbx, rbx + adox rbx, r9 + adcx rbp, rbp + adox rbp, r10 + mov rdx, 38 + mulx r11, rbp, rbp + add r15, 
rbp + adc r11, 0 + mov r9, 9223372036854775807 + shld r11, r15, 1 + imul r11, r11, 19 + and r15, r9 + xor r9, r9 + adox r12, r11 + mulx rdi, r11, rdi + adcx r12, r11 + adox r13, rdi + mulx rsi, r11, rsi + adcx r13, r11 + adox r14, rsi + mulx rbx, r11, rbx + adcx r14, r11 + adox r15, rbx + adcx r15, r9 + mov r11, r15 + shld r15, r14, 1 + shld r14, r13, 1 + shld r13, r12, 1 + shl r12, 1 + mov r9, 9223372036854775807 + shr r11, 62 + and r15, r9 + imul r11, r11, 19 + add r12, r11 + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Store + lea rax, QWORD PTR [rcx+64] + lea rcx, QWORD PTR [rcx+96] + ; Sub + sub r12, QWORD PTR [rax] + sbb r13, QWORD PTR [rax+8] + sbb r14, QWORD PTR [rax+16] + sbb r15, QWORD PTR [rax+24] + sbb rdx, rdx + shld rdx, r15, 1 + imul rdx, -19 + btr r15, 63 + ; Add modulus (if underflow) + sub r12, rdx + sbb r13, 0 + sbb r14, 0 + sbb r15, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + add rsp, 16 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +ge_p2_dbl_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +; ge_madd_avx2: takes three pointers in rcx, rdx and r8 (kept at [rsp], [rsp+8] and [rsp+16]); each is used as the base of 4x64-bit limb values laid out 32 bytes apart. +; Performs three mulx/adcx/adox multiplies (arg3+64 by arg2+96, arg3+0 by the sum at rcx+0, arg3+32 by the difference at rcx+32), each folded with the constants 38 and 19 and a 63-bit mask of the top limb, then doubles arg2+64 and writes sum/difference to rcx+64 / rcx+96. +; NOTE(review): the 38/19 folding looks like reduction modulo 2^255-19 - confirm against the C reference implementation. +; r12-r15, rdi, rsi, rbx and rbp are pushed on entry and popped before ret; 24 bytes of stack hold the three pointers. +ge_madd_avx2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov rax, rdx + sub rsp, 24 + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], rax + mov QWORD PTR [rsp+16], r8 + lea r9, QWORD PTR [rax+96] + lea r8, QWORD PTR [r8+64] + lea rcx, QWORD PTR [rcx+96] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [r9] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [r9+16] + ; A[1] * B[0] + mulx r11, r10, QWORD PTR [r9+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [r9+24] + adcx r14, r11 + ; A[0] * B[1] + mulx r11, r10, QWORD PTR [r9] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [r9+16] + adox r14, r11 + adcx r15, r10 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [r9+8] + adcx 
rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR [r9] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [r9+8] + adcx r15, r11 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, r10, QWORD PTR [r9+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [r9+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [r9+24] + adox rbx, rbp + adcx rbx, r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [r9] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [r9+24] + adcx rdi, r10 + mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [r9+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [r9+16] + adcx rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r11 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov r9, rax + lea r8, QWORD PTR [rax+32] + lea rax, QWORD PTR [rcx+-64] + lea rcx, QWORD PTR [rcx+-96] + ; Add-Sub + ; Add + mov r12, QWORD PTR [r8] + mov r13, QWORD PTR [r8+8] + mov r14, QWORD PTR [r8+16] + mov r15, QWORD PTR [r8+24] + mov rdi, r12 + add r12, QWORD PTR [r9] + mov rsi, r13 + adc r13, QWORD PTR [r9+8] + mov rbx, r14 + adc r14, QWORD PTR [r9+16] + mov rbp, r15 + adc r15, QWORD PTR [r9+24] + mov rdx, 0 + adc 
rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [r9] + sbb rsi, QWORD PTR [r9+8] + sbb rbx, QWORD PTR [r9+16] + sbb rbp, QWORD PTR [r9+24] + sbb rdx, rdx + shld rdx, rbp, 1 + imul rdx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, rdx + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [rax], rdi + mov QWORD PTR [rax+8], rsi + mov QWORD PTR [rax+16], rbx + mov QWORD PTR [rax+24], rbp + mov r8, QWORD PTR [rsp+16] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [rcx] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rcx+16] + ; A[1] * B[0] + mulx r11, r10, QWORD PTR [rcx+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rcx+24] + adcx r14, r11 + ; A[0] * B[1] + mulx r11, r10, QWORD PTR [rcx] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [rcx+16] + adox r14, r11 + adcx r15, r10 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [rcx+8] + adcx rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR [rcx] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [rcx+8] + adcx r15, r11 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, r10, QWORD PTR [rcx+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [rcx+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [rcx+24] + adox rbx, rbp + adcx rbx, r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [rcx] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rcx+24] + adcx rdi, r10 
+ mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [rcx+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [rcx+16] + adcx rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r11 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + lea r8, QWORD PTR [r8+32] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [rax] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rax+16] + ; A[1] * B[0] + mulx r11, r10, QWORD PTR [rax+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rax+24] + adcx r14, r11 + ; A[0] * B[1] + mulx r11, r10, QWORD PTR [rax] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [rax+16] + adox r14, r11 + adcx r15, r10 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [rax+8] + adcx rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR [rax] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [rax+8] + adcx r15, r11 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, r10, QWORD PTR [rax+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [rax+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [rax+24] + adox rbx, rbp + adcx rbx, 
r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [rax] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rax+24] + adcx rdi, r10 + mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [rax+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [rax+16] + adcx rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r11 + ; Store + mov QWORD PTR [rax], r12 + mov QWORD PTR [rax+8], r13 + mov QWORD PTR [rax+16], r14 + mov QWORD PTR [rax+24], r15 + ; Add-Sub + ; Add + mov r12, QWORD PTR [rcx] + mov r13, QWORD PTR [rcx+8] + mov r14, QWORD PTR [rcx+16] + mov r15, QWORD PTR [rcx+24] + mov rdi, r12 + add r12, QWORD PTR [rax] + mov rsi, r13 + adc r13, QWORD PTR [rax+8] + mov rbx, r14 + adc r14, QWORD PTR [rax+16] + mov rbp, r15 + adc r15, QWORD PTR [rax+24] + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [rax] + sbb rsi, QWORD PTR [rax+8] + sbb rbx, QWORD PTR [rax+16] + sbb rbp, QWORD PTR [rax+24] + sbb rdx, rdx + shld rdx, rbp, 1 + imul rdx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, rdx + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rax], r12 + mov QWORD PTR [rax+8], r13 + mov QWORD PTR [rax+16], r14 + mov QWORD PTR [rax+24], r15 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+8], rsi + mov QWORD PTR [rcx+16], rbx + mov QWORD PTR [rcx+24], rbp + lea r9, QWORD PTR [r9+64] + ; Double + mov r12, QWORD PTR 
[r9] + mov r13, QWORD PTR [r9+8] + add r12, r12 + mov r14, QWORD PTR [r9+16] + adc r13, r13 + mov r15, QWORD PTR [r9+24] + adc r14, r14 + adc r15, r15 + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + lea rax, QWORD PTR [rcx+96] + lea rcx, QWORD PTR [rcx+64] + ; Add-Sub + ; Add + mov rdi, r12 + add r12, QWORD PTR [rax] + mov rsi, r13 + adc r13, QWORD PTR [rax+8] + mov rbx, r14 + adc r14, QWORD PTR [rax+16] + mov rbp, r15 + adc r15, QWORD PTR [rax+24] + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [rax] + sbb rsi, QWORD PTR [rax+8] + sbb rbx, QWORD PTR [rax+16] + sbb rbp, QWORD PTR [rax+24] + sbb rdx, rdx + shld rdx, rbp, 1 + imul rdx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, rdx + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [rax], rdi + mov QWORD PTR [rax+8], rsi + mov QWORD PTR [rax+16], rbx + mov QWORD PTR [rax+24], rbp + add rsp, 24 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +ge_madd_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +; ge_msub_avx2: takes three pointers in rcx, rdx and r8 (kept at [rsp], [rsp+8] and [rsp+16]); each is used as the base of 4x64-bit limb values laid out 32 bytes apart. +; Same instruction sequence as ge_madd_avx2 except that the second multiply reads arg3+32 and the third reads arg3+0, and the final step stores the sum at rcx+96 and the difference at rcx+64. +; NOTE(review): the 38/19 folding in each multiply looks like reduction modulo 2^255-19 - confirm against the C reference implementation. +; r12-r15, rdi, rsi, rbx and rbp are pushed on entry and popped before ret; 24 bytes of stack hold the three pointers. +ge_msub_avx2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov rax, rdx + sub rsp, 24 + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], rax + mov QWORD PTR [rsp+16], r8 + lea r9, QWORD PTR [rax+96] + lea r8, QWORD PTR [r8+64] + lea rcx, QWORD PTR [rcx+96] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [r9] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [r9+16] + ; A[1] * B[0] + mulx r11, r10, QWORD PTR [r9+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [r9+24] + 
adcx r14, r11 + ; A[0] * B[1] + mulx r11, r10, QWORD PTR [r9] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [r9+16] + adox r14, r11 + adcx r15, r10 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [r9+8] + adcx rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR [r9] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [r9+8] + adcx r15, r11 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, r10, QWORD PTR [r9+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [r9+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [r9+24] + adox rbx, rbp + adcx rbx, r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [r9] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [r9+24] + adcx rdi, r10 + mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [r9+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [r9+16] + adcx rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r11 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov r9, rax + lea r8, QWORD PTR [rax+32] + lea rax, QWORD PTR [rcx+-64] + lea rcx, QWORD PTR [rcx+-96] + ; Add-Sub + ; Add + mov r12, QWORD PTR [r8] + mov r13, QWORD PTR [r8+8] + 
mov r14, QWORD PTR [r8+16] + mov r15, QWORD PTR [r8+24] + mov rdi, r12 + add r12, QWORD PTR [r9] + mov rsi, r13 + adc r13, QWORD PTR [r9+8] + mov rbx, r14 + adc r14, QWORD PTR [r9+16] + mov rbp, r15 + adc r15, QWORD PTR [r9+24] + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [r9] + sbb rsi, QWORD PTR [r9+8] + sbb rbx, QWORD PTR [r9+16] + sbb rbp, QWORD PTR [r9+24] + sbb rdx, rdx + shld rdx, rbp, 1 + imul rdx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, rdx + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [rax], rdi + mov QWORD PTR [rax+8], rsi + mov QWORD PTR [rax+16], rbx + mov QWORD PTR [rax+24], rbp + mov r8, QWORD PTR [rsp+16] + lea r8, QWORD PTR [r8+32] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [rcx] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rcx+16] + ; A[1] * B[0] + mulx r11, r10, QWORD PTR [rcx+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rcx+24] + adcx r14, r11 + ; A[0] * B[1] + mulx r11, r10, QWORD PTR [rcx] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [rcx+16] + adox r14, r11 + adcx r15, r10 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [rcx+8] + adcx rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR [rcx] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [rcx+8] + adcx r15, r11 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, r10, QWORD PTR [rcx+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [rcx+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * 
B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [rcx+24] + adox rbx, rbp + adcx rbx, r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [rcx] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rcx+24] + adcx rdi, r10 + mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [rcx+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [rcx+16] + adcx rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r11 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + lea r8, QWORD PTR [r8+-32] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [rax] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rax+16] + ; A[1] * B[0] + mulx r11, r10, QWORD PTR [rax+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rax+24] + adcx r14, r11 + ; A[0] * B[1] + mulx r11, r10, QWORD PTR [rax] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [rax+16] + adox r14, r11 + adcx r15, r10 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [rax+8] + adcx rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR [rax] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [rax+8] + adcx r15, r11 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, 
r10, QWORD PTR [rax+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [rax+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [rax+24] + adox rbx, rbp + adcx rbx, r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [rax] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rax+24] + adcx rdi, r10 + mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [rax+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [rax+16] + adcx rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r11 + ; Store + mov QWORD PTR [rax], r12 + mov QWORD PTR [rax+8], r13 + mov QWORD PTR [rax+16], r14 + mov QWORD PTR [rax+24], r15 + ; Add-Sub + ; Add + mov r12, QWORD PTR [rcx] + mov r13, QWORD PTR [rcx+8] + mov r14, QWORD PTR [rcx+16] + mov r15, QWORD PTR [rcx+24] + mov rdi, r12 + add r12, QWORD PTR [rax] + mov rsi, r13 + adc r13, QWORD PTR [rax+8] + mov rbx, r14 + adc r14, QWORD PTR [rax+16] + mov rbp, r15 + adc r15, QWORD PTR [rax+24] + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [rax] + sbb rsi, QWORD PTR [rax+8] + sbb rbx, QWORD PTR [rax+16] + sbb rbp, QWORD PTR [rax+24] + sbb rdx, rdx + shld rdx, rbp, 1 + imul rdx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, rdx + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR 
[rax], r12 + mov QWORD PTR [rax+8], r13 + mov QWORD PTR [rax+16], r14 + mov QWORD PTR [rax+24], r15 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+8], rsi + mov QWORD PTR [rcx+16], rbx + mov QWORD PTR [rcx+24], rbp + lea r9, QWORD PTR [r9+64] + ; Double + mov r12, QWORD PTR [r9] + mov r13, QWORD PTR [r9+8] + add r12, r12 + mov r14, QWORD PTR [r9+16] + adc r13, r13 + mov r15, QWORD PTR [r9+24] + adc r14, r14 + adc r15, r15 + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + lea rax, QWORD PTR [rcx+96] + lea rcx, QWORD PTR [rcx+64] + ; Add-Sub + ; Add + mov rdi, r12 + add r12, QWORD PTR [rax] + mov rsi, r13 + adc r13, QWORD PTR [rax+8] + mov rbx, r14 + adc r14, QWORD PTR [rax+16] + mov rbp, r15 + adc r15, QWORD PTR [rax+24] + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [rax] + sbb rsi, QWORD PTR [rax+8] + sbb rbx, QWORD PTR [rax+16] + sbb rbp, QWORD PTR [rax+24] + sbb rdx, rdx + shld rdx, rbp, 1 + imul rdx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, rdx + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rax], r12 + mov QWORD PTR [rax+8], r13 + mov QWORD PTR [rax+16], r14 + mov QWORD PTR [rax+24], r15 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+8], rsi + mov QWORD PTR [rcx+16], rbx + mov QWORD PTR [rcx+24], rbp + add rsp, 24 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +ge_msub_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +; ge_add_avx2: takes three pointers in rcx, rdx and r8 (kept at [rsp], [rsp+8] and [rsp+16]); each is used as the base of 4x64-bit limb values laid out 32 bytes apart. +; Performs four mulx/adcx/adox multiplies (arg3+96 by arg2+96, arg3+0 by the sum at rcx+0, arg3+32 by the difference at rcx+32, arg3+64 by arg2+64 with the product then doubled), interleaved with add/sub steps that write rcx+0, rcx+32, rcx+64 and rcx+96. +; NOTE(review): the 38/19 folding in each multiply looks like reduction modulo 2^255-19 - confirm against the C reference implementation. +; r12-r15, rdi, rsi, rbx and rbp are pushed on entry and popped before ret; 24 bytes of stack hold the three pointers. +ge_add_avx2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov rax, rdx + sub rsp, 24 + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], rax + mov QWORD PTR [rsp+16], r8 + lea r9, QWORD PTR [rax+96] + lea r8, QWORD PTR [r8+96] + lea rcx, QWORD PTR [rcx+96] + ; Multiply + ; A[0] * B[0] 
+ mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [r9] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [r9+16] + ; A[1] * B[0] + mulx r11, r10, QWORD PTR [r9+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [r9+24] + adcx r14, r11 + ; A[0] * B[1] + mulx r11, r10, QWORD PTR [r9] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [r9+16] + adox r14, r11 + adcx r15, r10 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [r9+8] + adcx rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR [r9] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [r9+8] + adcx r15, r11 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, r10, QWORD PTR [r9+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [r9+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [r9+24] + adox rbx, rbp + adcx rbx, r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [r9] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [r9+24] + adcx rdi, r10 + mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [r9+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [r9+16] + adcx rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r11 + ; Store + mov QWORD PTR [rcx], r12 + 
mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov r9, rax + lea r8, QWORD PTR [rax+32] + lea rax, QWORD PTR [rcx+-64] + lea rcx, QWORD PTR [rcx+-96] + ; Add-Sub + ; Add + mov r12, QWORD PTR [r8] + mov r13, QWORD PTR [r8+8] + mov r14, QWORD PTR [r8+16] + mov r15, QWORD PTR [r8+24] + mov rdi, r12 + add r12, QWORD PTR [r9] + mov rsi, r13 + adc r13, QWORD PTR [r9+8] + mov rbx, r14 + adc r14, QWORD PTR [r9+16] + mov rbp, r15 + adc r15, QWORD PTR [r9+24] + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [r9] + sbb rsi, QWORD PTR [r9+8] + sbb rbx, QWORD PTR [r9+16] + sbb rbp, QWORD PTR [r9+24] + sbb rdx, rdx + shld rdx, rbp, 1 + imul rdx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, rdx + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [rax], rdi + mov QWORD PTR [rax+8], rsi + mov QWORD PTR [rax+16], rbx + mov QWORD PTR [rax+24], rbp + mov r8, QWORD PTR [rsp+16] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [rcx] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rcx+16] + ; A[1] * B[0] + mulx r11, r10, QWORD PTR [rcx+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rcx+24] + adcx r14, r11 + ; A[0] * B[1] + mulx r11, r10, QWORD PTR [rcx] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [rcx+16] + adox r14, r11 + adcx r15, r10 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [rcx+8] + adcx rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR [rcx] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [rcx+8] + adcx r15, r11 + adox r14, 
rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, r10, QWORD PTR [rcx+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [rcx+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [rcx+24] + adox rbx, rbp + adcx rbx, r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [rcx] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rcx+24] + adcx rdi, r10 + mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [rcx+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [rcx+16] + adcx rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r11 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + lea r8, QWORD PTR [r8+32] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [rax] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rax+16] + ; A[1] * B[0] + mulx r11, r10, QWORD PTR [rax+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rax+24] + adcx r14, r11 + ; A[0] * B[1] + mulx r11, r10, QWORD PTR [rax] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [rax+16] + adox r14, r11 + adcx r15, r10 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [rax+8] + adcx rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR 
[rax] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [rax+8] + adcx r15, r11 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, r10, QWORD PTR [rax+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [rax+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [rax+24] + adox rbx, rbp + adcx rbx, r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [rax] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rax+24] + adcx rdi, r10 + mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [rax+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [rax+16] + adcx rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r11 + ; Store + mov QWORD PTR [rax], r12 + mov QWORD PTR [rax+8], r13 + mov QWORD PTR [rax+16], r14 + mov QWORD PTR [rax+24], r15 + lea r9, QWORD PTR [r9+64] + lea r8, QWORD PTR [r8+32] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [r9] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [r9+16] + ; A[1] * B[0] + mulx r11, r10, QWORD PTR [r9+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [r9+24] + adcx r14, r11 + ; A[0] * B[1] + mulx r11, r10, QWORD PTR [r9] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [r9+16] + adox r14, r11 + adcx r15, r10 + ; 
A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [r9+8] + adcx rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR [r9] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [r9+8] + adcx r15, r11 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, r10, QWORD PTR [r9+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [r9+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [r9+24] + adox rbx, rbp + adcx rbx, r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [r9] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [r9+24] + adcx rdi, r10 + mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [r9+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [r9+16] + adcx rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r11 + ; Store + lea rcx, QWORD PTR [rcx+64] + ; Double + add r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + lea rcx, QWORD PTR [rcx+-64] + ; Add-Sub + ; Add + mov r12, QWORD PTR [rcx] + mov r13, 
QWORD PTR [rcx+8] + mov r14, QWORD PTR [rcx+16] + mov r15, QWORD PTR [rcx+24] + mov rdi, r12 + add r12, QWORD PTR [rax] + mov rsi, r13 + adc r13, QWORD PTR [rax+8] + mov rbx, r14 + adc r14, QWORD PTR [rax+16] + mov rbp, r15 + adc r15, QWORD PTR [rax+24] + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [rax] + sbb rsi, QWORD PTR [rax+8] + sbb rbx, QWORD PTR [rax+16] + sbb rbp, QWORD PTR [rax+24] + sbb rdx, rdx + shld rdx, rbp, 1 + imul rdx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, rdx + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rax], r12 + mov QWORD PTR [rax+8], r13 + mov QWORD PTR [rax+16], r14 + mov QWORD PTR [rax+24], r15 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+8], rsi + mov QWORD PTR [rcx+16], rbx + mov QWORD PTR [rcx+24], rbp + lea rax, QWORD PTR [rcx+96] + lea rcx, QWORD PTR [rcx+64] + ; Add-Sub + ; Add + mov r12, QWORD PTR [rcx] + mov r13, QWORD PTR [rcx+8] + mov r14, QWORD PTR [rcx+16] + mov r15, QWORD PTR [rcx+24] + mov rdi, r12 + add r12, QWORD PTR [rax] + mov rsi, r13 + adc r13, QWORD PTR [rax+8] + mov rbx, r14 + adc r14, QWORD PTR [rax+16] + mov rbp, r15 + adc r15, QWORD PTR [rax+24] + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [rax] + sbb rsi, QWORD PTR [rax+8] + sbb rbx, QWORD PTR [rax+16] + sbb rbp, QWORD PTR [rax+24] + sbb rdx, rdx + shld rdx, rbp, 1 + imul rdx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, rdx + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [rax], rdi + mov QWORD PTR [rax+8], rsi + mov QWORD PTR [rax+16], rbx + mov QWORD PTR [rax+24], rbp + add rsp, 24 + pop rbp + pop rbx + pop rsi + 
pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +ge_add_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +ge_sub_avx2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov rax, rdx + sub rsp, 24 + mov QWORD PTR [rsp], rcx + mov QWORD PTR [rsp+8], rax + mov QWORD PTR [rsp+16], r8 + lea r9, QWORD PTR [rax+96] + lea r8, QWORD PTR [r8+96] + lea rcx, QWORD PTR [rcx+96] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [r9] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [r9+16] + ; A[1] * B[0] + mulx r11, r10, QWORD PTR [r9+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [r9+24] + adcx r14, r11 + ; A[0] * B[1] + mulx r11, r10, QWORD PTR [r9] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [r9+16] + adox r14, r11 + adcx r15, r10 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [r9+8] + adcx rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR [r9] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [r9+8] + adcx r15, r11 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, r10, QWORD PTR [r9+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [r9+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [r9+24] + adox rbx, rbp + adcx rbx, r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [r9] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [r9+24] + adcx rdi, r10 + mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [r9+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [r9+16] + adcx 
rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r11 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov r9, rax + lea r8, QWORD PTR [rax+32] + lea rax, QWORD PTR [rcx+-64] + lea rcx, QWORD PTR [rcx+-96] + ; Add-Sub + ; Add + mov r12, QWORD PTR [r8] + mov r13, QWORD PTR [r8+8] + mov r14, QWORD PTR [r8+16] + mov r15, QWORD PTR [r8+24] + mov rdi, r12 + add r12, QWORD PTR [r9] + mov rsi, r13 + adc r13, QWORD PTR [r9+8] + mov rbx, r14 + adc r14, QWORD PTR [r9+16] + mov rbp, r15 + adc r15, QWORD PTR [r9+24] + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [r9] + sbb rsi, QWORD PTR [r9+8] + sbb rbx, QWORD PTR [r9+16] + sbb rbp, QWORD PTR [r9+24] + sbb rdx, rdx + shld rdx, rbp, 1 + imul rdx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, rdx + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [rax], rdi + mov QWORD PTR [rax+8], rsi + mov QWORD PTR [rax+16], rbx + mov QWORD PTR [rax+24], rbp + mov r8, QWORD PTR [rsp+16] + lea r8, QWORD PTR [r8+32] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [rcx] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rcx+16] + ; A[1] * B[0] + mulx r11, r10, QWORD PTR [rcx+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rcx+24] + adcx r14, r11 + ; A[0] * B[1] + 
mulx r11, r10, QWORD PTR [rcx] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [rcx+16] + adox r14, r11 + adcx r15, r10 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [rcx+8] + adcx rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR [rcx] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [rcx+8] + adcx r15, r11 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, r10, QWORD PTR [rcx+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [rcx+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [rcx+24] + adox rbx, rbp + adcx rbx, r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [rcx] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rcx+24] + adcx rdi, r10 + mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [rcx+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [rcx+16] + adcx rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r11 + ; Store + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + lea r8, QWORD PTR [r8+-32] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [rax] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [rax+16] + ; A[1] * B[0] + mulx r11, r10, 
QWORD PTR [rax+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [rax+24] + adcx r14, r11 + ; A[0] * B[1] + mulx r11, r10, QWORD PTR [rax] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [rax+16] + adox r14, r11 + adcx r15, r10 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [rax+8] + adcx rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR [rax] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [rax+8] + adcx r15, r11 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, r10, QWORD PTR [rax+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [rax+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [rax+24] + adox rbx, rbp + adcx rbx, r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [rax] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [rax+24] + adcx rdi, r10 + mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [rax+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [rax+16] + adcx rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx r15, r11 + ; Store + mov QWORD PTR [rax], r12 + mov QWORD PTR [rax+8], r13 + mov QWORD PTR [rax+16], r14 + mov QWORD PTR [rax+24], r15 + lea r9, QWORD PTR [r9+64] + lea r8, 
QWORD PTR [r8+64] + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r8] + mulx r13, r12, QWORD PTR [r9] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [r9+16] + ; A[1] * B[0] + mulx r11, r10, QWORD PTR [r9+8] + xor rbp, rbp + adcx r13, r10 + ; A[3] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx rsi, rdi, QWORD PTR [r9+24] + adcx r14, r11 + ; A[0] * B[1] + mulx r11, r10, QWORD PTR [r9] + adox r13, r10 + ; A[2] * B[1] + mulx rbx, r10, QWORD PTR [r9+16] + adox r14, r11 + adcx r15, r10 + ; A[1] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r11, r10, QWORD PTR [r9+8] + adcx rdi, rbx + adox r15, r10 + adcx rsi, rbp + adox rdi, r11 + ; A[0] * B[2] + mulx r11, r10, QWORD PTR [r9] + adox rsi, rbp + xor rbx, rbx + adcx r14, r10 + ; A[1] * B[1] + mov rdx, QWORD PTR [r8+8] + mulx r10, rdx, QWORD PTR [r9+8] + adcx r15, r11 + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r8+24] + adox r15, r10 + mulx r11, r10, QWORD PTR [r9+8] + adcx rdi, r10 + ; A[2] * B[2] + mov rdx, QWORD PTR [r8+16] + mulx r10, rdx, QWORD PTR [r9+16] + adcx rsi, r11 + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r8+24] + adox rsi, r10 + mulx r11, r10, QWORD PTR [r9+24] + adox rbx, rbp + adcx rbx, r10 + ; A[0] * B[3] + mulx r10, rdx, QWORD PTR [r9] + adcx rbp, r11 + xor r11, r11 + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [r9+24] + adcx rdi, r10 + mulx r10, rdx, QWORD PTR [r8] + adox r15, rdx + adox rdi, r10 + ; A[3] * B[2] + mov rdx, QWORD PTR [r9+24] + mulx r10, rdx, QWORD PTR [r8+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r8+24] + adcx rbx, r10 + mulx rdx, r10, QWORD PTR [r9+16] + adcx rbp, r11 + adox rsi, r10 + adox rbx, rdx + adox rbp, r11 + mov rdx, 38 + mulx r10, rbp, rbp + add r15, rbp + adc r10, 0 + mov r11, 9223372036854775807 + shld r10, r15, 1 + imul r10, r10, 19 + and r15, r11 + xor r11, r11 + adox r12, r10 + mulx rdi, r10, rdi + adcx r12, r10 + adox r13, rdi + mulx rsi, r10, rsi + adcx r13, r10 + adox r14, rsi + mulx rbx, r10, rbx + adcx r14, r10 + adox r15, rbx + adcx 
r15, r11 + ; Store + lea rcx, QWORD PTR [rcx+64] + ; Double + add r12, r12 + adc r13, r13 + adc r14, r14 + adc r15, r15 + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+24], r15 + lea rcx, QWORD PTR [rcx+-64] + ; Add-Sub + ; Add + mov r12, QWORD PTR [rcx] + mov r13, QWORD PTR [rcx+8] + mov r14, QWORD PTR [rcx+16] + mov r15, QWORD PTR [rcx+24] + mov rdi, r12 + add r12, QWORD PTR [rax] + mov rsi, r13 + adc r13, QWORD PTR [rax+8] + mov rbx, r14 + adc r14, QWORD PTR [rax+16] + mov rbp, r15 + adc r15, QWORD PTR [rax+24] + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [rax] + sbb rsi, QWORD PTR [rax+8] + sbb rbx, QWORD PTR [rax+16] + sbb rbp, QWORD PTR [rax+24] + sbb rdx, rdx + shld rdx, rbp, 1 + imul rdx, -19 + btr rbp, 63 + ; Add modulus (if underflow) + sub rdi, rdx + sbb rsi, 0 + sbb rbx, 0 + sbb rbp, 0 + mov QWORD PTR [rax], r12 + mov QWORD PTR [rax+8], r13 + mov QWORD PTR [rax+16], r14 + mov QWORD PTR [rax+24], r15 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+8], rsi + mov QWORD PTR [rcx+16], rbx + mov QWORD PTR [rcx+24], rbp + lea rax, QWORD PTR [rcx+64] + lea rcx, QWORD PTR [rcx+96] + ; Add-Sub + ; Add + mov r12, QWORD PTR [rax] + mov r13, QWORD PTR [rax+8] + mov r14, QWORD PTR [rax+16] + mov r15, QWORD PTR [rax+24] + mov rdi, r12 + add r12, QWORD PTR [rcx] + mov rsi, r13 + adc r13, QWORD PTR [rcx+8] + mov rbx, r14 + adc r14, QWORD PTR [rcx+16] + mov rbp, r15 + adc r15, QWORD PTR [rcx+24] + mov rdx, 0 + adc rdx, 0 + shld rdx, r15, 1 + imul rdx, 19 + btr r15, 63 + ; Sub modulus (if overflow) + add r12, rdx + adc r13, 0 + adc r14, 0 + adc r15, 0 + ; Sub + sub rdi, QWORD PTR [rcx] + sbb rsi, QWORD PTR [rcx+8] + sbb 
rbx, QWORD PTR [rcx+16]
+        sbb rbp, QWORD PTR [rcx+24]
+        sbb rdx, rdx
+        shld rdx, rbp, 1
+        imul rdx, -19
+        btr rbp, 63
+        ; Add modulus (if underflow)
+        sub rdi, rdx
+        sbb rsi, 0
+        sbb rbx, 0
+        sbb rbp, 0
+        mov QWORD PTR [rcx], r12
+        mov QWORD PTR [rcx+8], r13
+        mov QWORD PTR [rcx+16], r14
+        mov QWORD PTR [rcx+24], r15
+        mov QWORD PTR [rax], rdi
+        mov QWORD PTR [rax+8], rsi
+        mov QWORD PTR [rax+16], rbx
+        mov QWORD PTR [rax+24], rbp
+        add rsp, 24  ; drop the three 8-byte slots reserved by the prologue's sub rsp, 24
+        pop rbp
+        pop rbx
+        pop rsi
+        pop rdi
+        pop r15
+        pop r14
+        pop r13
+        pop r12
+        ret
+ge_sub_avx2 ENDP
+_TEXT ENDS
+IFDEF HAVE_ED25519
+_TEXT SEGMENT READONLY PARA
+fe_sq2_avx2 PROC  ; rcx = r (4 qwords written), rdx = a (4 qwords read)
+        push rbx
+        push r12
+        push r13
+        push r14
+        push r15
+        push rdi
+        push rsi
+        mov rdi, rcx  ; rdi = r; rcx becomes scratch
+        mov rsi, rdx  ; rsi = a; rdx is the implicit multiplicand of every mulx
+        ; Square * 2: r = 2 * (a * a); words 4..7 are folded in times 38, bits >= 255 times 19
+        mov rdx, QWORD PTR [rsi]
+        mov rax, QWORD PTR [rsi+8]
+        ; A[0] * A[1]
+        mov r15, rdx  ; keep A[0] for later products
+        mulx r10, r9, rax
+        ; A[0] * A[3]
+        mulx r12, r11, QWORD PTR [rsi+24]
+        ; A[2] * A[1]
+        mov rdx, QWORD PTR [rsi+16]
+        mulx rbx, rcx, rax
+        xor r8, r8  ; r8 = 0 and CF/OF cleared for the adcx/adox chains
+        adox r11, rcx
+        ; A[2] * A[3]
+        mulx r14, r13, QWORD PTR [rsi+24]
+        adox r12, rbx
+        ; A[2] * A[0]
+        mulx rbx, rcx, r15
+        adox r13, r8
+        adcx r10, rcx
+        adox r14, r8
+        ; A[1] * A[3]
+        mov rdx, rax
+        mulx rdx, rcx, QWORD PTR [rsi+24]
+        adcx r11, rbx
+        adcx r12, rcx
+        adcx r13, rdx
+        adcx r14, r8
+        ; A[0] * A[0]
+        mov rdx, r15
+        mulx rcx, r8, rdx
+        xor r15, r15  ; r15 = 0, CF/OF cleared: adcx doubles cross products, adox adds squares
+        adcx r9, r9
+        ; A[1] * A[1]
+        mov rdx, rax
+        adox r9, rcx
+        mulx rbx, rcx, rdx
+        adcx r10, r10
+        adox r10, rcx
+        adcx r11, r11
+        ; A[2] * A[2]
+        mov rdx, QWORD PTR [rsi+16]
+        adox r11, rbx
+        mulx rcx, rbx, rdx
+        adcx r12, r12
+        adox r12, rbx
+        adcx r13, r13
+        ; A[3] * A[3]
+        mov rdx, QWORD PTR [rsi+24]
+        adox r13, rcx
+        mulx rbx, rcx, rdx
+        adcx r14, r14
+        adox r14, rcx
+        adcx r15, r15
+        adox r15, rbx
+        mov rdx, 38  ; multiplier applied to product words 4..7 (r12..r15) below
+        mulx rax, r15, r15
+        add r11, r15
+        adc rax, 0
+        mov rcx, 9223372036854775807  ; 2^63 - 1: mask keeping bits 0..62 of the top word
+        shld rax, r11, 1
+        imul rax, rax, 19  ; everything at or above bit 255 re-enters the low word times 19
+        and r11, rcx
+        xor rcx, rcx  ; rcx = 0, CF/OF cleared for the final fold
+        adox r8, rax
+        mulx r12, rax, r12
+        adcx r8, rax
+        adox r9, r12
+        mulx r13, rax, r13
+        adcx r9, rax
+        adox r10, r13
+        mulx r14, rax, r14
+        adcx r10, rax
+        adox r11, r14
+        adcx r11, rcx
+        mov rax, r11  ; keep the undoubled top word
+        shld r11, r10, 1
+        shld r10, r9, 1
+        shld r9, r8, 1
+        shl r8, 1  ; r8..r11 now hold twice the reduced square
+        mov rcx, 9223372036854775807  ; 2^63 - 1
+        shr rax, 62  ; the two bits that the doubling pushed to bit 255 and above
+        and r11, rcx
+        imul rax, rax, 19  ; fold them back in times 19
+        add r8, rax
+        adc r9, 0
+        adc r10, 0
+        adc r11, 0
+        ; Store
+        mov QWORD PTR [rdi], r8
+        mov QWORD PTR [rdi+8], r9
+        mov QWORD PTR [rdi+16], r10
+        mov QWORD PTR [rdi+24], r11
+        pop rsi
+        pop rdi
+        pop r15
+        pop r14
+        pop r13
+        pop r12
+        pop rbx
+        ret
+fe_sq2_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+sc_reduce_avx2 PROC  ; rcx = s: eight qwords (64 bytes) are loaded from it below
+        push r12
+        push r13
+        push r14
+        push r15
+        push rdi
+        push rsi
+        push rbx
+        push rbp
+        mov r8, rcx  ; r8 = s; rcx becomes scratch
+        mov r9, QWORD PTR [r8]
+        mov r10, QWORD PTR [r8+8]
+        mov r11, QWORD PTR [r8+16]
+        mov r12, QWORD PTR [r8+24]
+        mov r13, QWORD PTR [r8+32]
+        mov r14, QWORD PTR [r8+40]
+        mov r15, QWORD PTR [r8+48]
+        mov rdi, QWORD PTR [r8+56]
+        mov rax, rdi
+        mov rcx, 1152921504606846975  ; 2^60 - 1
+        shr rax, 56  ; rax = top byte of word 7, i.e. bits 504..511
+        shld rdi, r15, 4  ; shift words 4..7 left by 4 so r13..rdi start at bit 252
+        shld r15, r14, 4
+        shld r14, r13, 4
+        shld r13, r12, 4
+        and r12, rcx  ; word 3 keeps bits 192..251 only
+        and rdi, rcx
+        ; Add order times bits 504..511
+        sub r15, rax
+        sbb rdi, 0
+        mov rdx, 16942830013509034793  ; 2^64 - 1503914060200516823
+        mulx rcx, rsi, rax
+        mov rdx, 12100500283911187475  ; 2^64 - 6346243789798364141
+        add r14, rsi
+        mulx rbx, rsi, rax
+        adc rcx, 0
+        add r13, rsi
+        adc r14, rbx
+        adc r15, rcx
+        adc rdi, 0
+        ; Sub product of top 4 words and order
+        mov rdx, 12100500283911187475  ; same multiplier as above, applied to r13, r15, r14, rdi
+        mulx rax, rcx, r13
+        add r9, rcx
+        adc r10, rax
+        mulx rax, rcx, r15
+        adc r11, rcx
+        adc r12, rax
+        mov rsi, 0  ; rsi collects the carry out of word 3
+        adc rsi, 0
+        mulx rax, rcx, r14
+        add r10, rcx
+        adc r11, rax
+        mulx rax, rcx, rdi
+        adc r12, rcx
+        adc rsi, rax
+        mov rdx, 16942830013509034793  ; second multiplier, products land one word higher
+        mulx rax, rcx, r13
+        add r10, rcx
+        adc r11, rax
+        mulx rax, rcx, r15
+        adc r12, rcx
+        adc rsi, rax
+        mov rbx, 0  ; rbx collects the next carry word
+        adc rbx, 0
+        mulx rax, rcx, r14
+        add r11, rcx
+        adc r12, rax
+        mulx rax, rcx, rdi
+        adc rsi, rcx
+        adc rbx, rax
+        sub r11, r13
+        mov r13, rsi
+        sbb r12, r14
+        mov r14, rbx
+        sbb r13, r15
+
sbb r14, rdi + mov rax, r14 + sar rax, 57 + ; Conditionally subtract order starting at bit 125 + mov rsi, 11529215046068469760 + mov rbx, 14628338529006959229 + mov rbp, 187989257525064602 + mov rcx, 144115188075855872 + and rsi, rax + and rbx, rax + and rbp, rax + and rcx, rax + add r10, rsi + adc r11, rbx + adc r12, rbp + adc r13, 0 + adc r14, rcx + ; Move bits 252-376 to own registers + mov rax, 1152921504606846975 + shld r14, r13, 4 + shld r13, r12, 4 + and r12, rax + ; Sub product of top 2 words and order + ; * -5812631a5cf5d3ed + mov rdx, 12100500283911187475 + mulx rax, rbp, r13 + mov rsi, 0 + add r9, rbp + adc r10, rax + mulx rax, rbp, r14 + adc rsi, 0 + add r10, rbp + adc rsi, rax + ; * -14def9dea2f79cd7 + mov rdx, 16942830013509034793 + mulx rax, rbp, r13 + mov rbx, 0 + add r10, rbp + adc r11, rax + mulx rax, rbp, r14 + adc rbx, 0 + add r11, rbp + adc rbx, rax + ; Add overflows at 2 * 64 + mov rcx, 1152921504606846975 + and r12, rcx + add r11, rsi + adc r12, rbx + ; Subtract top at 2 * 64 + sub r11, r13 + sbb r12, r14 + sbb rcx, rcx + ; Conditional sub order + mov rsi, 6346243789798364141 + mov rbx, 1503914060200516822 + mov rbp, 1152921504606846976 + and rsi, rcx + and rbx, rcx + and rbp, rcx + add r9, rsi + mov rsi, 1152921504606846975 + adc r10, rbx + adc r11, 0 + adc r12, rbp + and r12, rsi + ; Store result + mov QWORD PTR [r8], r9 + mov QWORD PTR [r8+8], r10 + mov QWORD PTR [r8+16], r11 + mov QWORD PTR [r8+24], r12 + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sc_reduce_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +sc_muladd_avx2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + mov r10, r8 + mov r8, rcx + mov r11, r9 + mov r9, rdx + ; Multiply + ; A[0] * B[0] + mov rdx, QWORD PTR [r10] + mulx r13, r12, QWORD PTR [r9] + ; A[2] * B[0] + mulx r15, r14, QWORD PTR [r9+16] + ; A[1] * B[0] + mulx rcx, rax, QWORD PTR [r9+8] + xor rbp, rbp + adcx r13, rax + ; A[3] * 
B[1] + mov rdx, QWORD PTR [r10+8] + mulx rsi, rdi, QWORD PTR [r9+24] + adcx r14, rcx + ; A[0] * B[1] + mulx rcx, rax, QWORD PTR [r9] + adox r13, rax + ; A[2] * B[1] + mulx rbx, rax, QWORD PTR [r9+16] + adox r14, rcx + adcx r15, rax + ; A[1] * B[2] + mov rdx, QWORD PTR [r10+16] + mulx rcx, rax, QWORD PTR [r9+8] + adcx rdi, rbx + adox r15, rax + adcx rsi, rbp + adox rdi, rcx + ; A[0] * B[2] + mulx rcx, rax, QWORD PTR [r9] + adox rsi, rbp + xor rbx, rbx + adcx r14, rax + ; A[1] * B[1] + mov rdx, QWORD PTR [r10+8] + mulx rax, rdx, QWORD PTR [r9+8] + adcx r15, rcx + adox r14, rdx + ; A[1] * B[3] + mov rdx, QWORD PTR [r10+24] + adox r15, rax + mulx rcx, rax, QWORD PTR [r9+8] + adcx rdi, rax + ; A[2] * B[2] + mov rdx, QWORD PTR [r10+16] + mulx rax, rdx, QWORD PTR [r9+16] + adcx rsi, rcx + adox rdi, rdx + ; A[3] * B[3] + mov rdx, QWORD PTR [r10+24] + adox rsi, rax + mulx rcx, rax, QWORD PTR [r9+24] + adox rbx, rbp + adcx rbx, rax + ; A[0] * B[3] + mulx rax, rdx, QWORD PTR [r9] + adcx rbp, rcx + xor rcx, rcx + adcx r15, rdx + ; A[3] * B[0] + mov rdx, QWORD PTR [r9+24] + adcx rdi, rax + mulx rax, rdx, QWORD PTR [r10] + adox r15, rdx + adox rdi, rax + ; A[3] * B[2] + mov rdx, QWORD PTR [r9+24] + mulx rax, rdx, QWORD PTR [r10+16] + adcx rsi, rdx + ; A[2] * B[3] + mov rdx, QWORD PTR [r10+24] + adcx rbx, rax + mulx rdx, rax, QWORD PTR [r9+16] + adcx rbp, rcx + adox rsi, rax + adox rbx, rdx + adox rbp, rcx + ; Add c to a * b + add r12, QWORD PTR [r11] + adc r13, QWORD PTR [r11+8] + adc r14, QWORD PTR [r11+16] + adc r15, QWORD PTR [r11+24] + adc rdi, 0 + adc rsi, 0 + adc rbx, 0 + adc rbp, 0 + mov rax, rbp + mov rcx, 1152921504606846975 + shr rax, 56 + shld rbp, rbx, 4 + shld rbx, rsi, 4 + shld rsi, rdi, 4 + shld rdi, r15, 4 + and r15, rcx + and rbp, rcx + ; Add order times bits 504..507 + sub rbx, rax + sbb rbp, 0 + mov rdx, 16942830013509034793 + mulx rcx, r9, rax + mov rdx, 12100500283911187475 + add rsi, r9 + mulx r10, r9, rax + adc rcx, 0 + add rdi, r9 + adc rsi, r10 + adc 
rbx, rcx + adc rbp, 0 + ; Sub product of top 4 words and order + mov rdx, 12100500283911187475 + mulx rax, rcx, rdi + add r12, rcx + adc r13, rax + mulx rax, rcx, rbx + adc r14, rcx + adc r15, rax + mov r9, 0 + adc r9, 0 + mulx rax, rcx, rsi + add r13, rcx + adc r14, rax + mulx rax, rcx, rbp + adc r15, rcx + adc r9, rax + mov rdx, 16942830013509034793 + mulx rax, rcx, rdi + add r13, rcx + adc r14, rax + mulx rax, rcx, rbx + adc r15, rcx + adc r9, rax + mov r10, 0 + adc r10, 0 + mulx rax, rcx, rsi + add r14, rcx + adc r15, rax + mulx rax, rcx, rbp + adc r9, rcx + adc r10, rax + sub r14, rdi + mov rdi, r9 + sbb r15, rsi + mov rsi, r10 + sbb rdi, rbx + sbb rsi, rbp + mov rax, rsi + sar rax, 57 + ; Conditionally subtract order starting at bit 125 + mov r9, 11529215046068469760 + mov r10, 14628338529006959229 + mov r11, 187989257525064602 + mov rcx, 144115188075855872 + and r9, rax + and r10, rax + and r11, rax + and rcx, rax + add r13, r9 + adc r14, r10 + adc r15, r11 + adc rdi, 0 + adc rsi, rcx + ; Move bits 252-376 to own registers + mov rax, 1152921504606846975 + shld rsi, rdi, 4 + shld rdi, r15, 4 + and r15, rax + ; Sub product of top 2 words and order + ; * -5812631a5cf5d3ed + mov rdx, 12100500283911187475 + mulx rax, r11, rdi + mov r9, 0 + add r12, r11 + adc r13, rax + mulx rax, r11, rsi + adc r9, 0 + add r13, r11 + adc r9, rax + ; * -14def9dea2f79cd7 + mov rdx, 16942830013509034793 + mulx rax, r11, rdi + mov r10, 0 + add r13, r11 + adc r14, rax + mulx rax, r11, rsi + adc r10, 0 + add r14, r11 + adc r10, rax + ; Add overflows at 2 * 64 + mov rcx, 1152921504606846975 + and r15, rcx + add r14, r9 + adc r15, r10 + ; Subtract top at 2 * 64 + sub r14, rdi + sbb r15, rsi + sbb rcx, rcx + ; Conditional sub order + mov r9, 6346243789798364141 + mov r10, 1503914060200516822 + mov r11, 1152921504606846976 + and r9, rcx + and r10, rcx + and r11, rcx + add r12, r9 + mov r9, 1152921504606846975 + adc r13, r10 + adc r14, 0 + adc r15, r11 + and r15, r9 + ; Store result + mov 
QWORD PTR [r8], r12
+        mov QWORD PTR [r8+8], r13
+        mov QWORD PTR [r8+16], r14
+        mov QWORD PTR [r8+24], r15
+        pop rbp
+        pop rbx
+        pop rsi
+        pop rdi
+        pop r15
+        pop r14
+        pop r13
+        pop r12
+        ret
+sc_muladd_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_prime DWORD 03ffffedh, 03ffffffh, 03ffffffh, 03ffffffh  ; 2^255-19 as 26-bit limbs: even limbs 0,2,4,6 (first 32 bytes -> ymm6)
+        DWORD 03ffffffh, 00000000h, 00000000h, 00000000h  ; even limb 8, then three zero lanes
+        DWORD 03ffffffh, 03ffffffh, 03ffffffh, 03ffffffh  ; odd limbs 1,3,5,7 (second 32 bytes -> ymm7)
+        DWORD 001fffffh, 00000000h, 00000000h, 00000000h  ; odd limb 9 is 21 bits wide (9*26 + 21 = 255)
+ptr_L_fe_invert_nct_avx2_prime QWORD L_fe_invert_nct_avx2_prime  ; address slot; fe_invert_nct_avx2 loads it into rbx
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_one QWORD 0000000000000001h, 0000000000000000h  ; value 1 in lane 0: initial coefficient and vptest mask for bit 0
+        QWORD 0000000000000000h, 0000000000000000h
+ptr_L_fe_invert_nct_avx2_one QWORD L_fe_invert_nct_avx2_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_all_one DWORD 00000001h, 00000001h, 00000001h, 00000001h  ; bit 0 of every dword lane (vpand mask for the odd-limb vector)
+        DWORD 00000001h, 00000001h, 00000001h, 00000001h
+ptr_L_fe_invert_nct_avx2_all_one QWORD L_fe_invert_nct_avx2_all_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_mask01111 DWORD 00000000h, 00000001h, 00000001h, 00000001h  ; bit 0 of lanes 1..4 only (vpand mask for the even-limb vector)
+        DWORD 00000001h, 00000000h, 00000000h, 00000000h
+ptr_L_fe_invert_nct_avx2_mask01111 QWORD L_fe_invert_nct_avx2_mask01111
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_down_one_dword DWORD 00000001h, 00000002h, 00000003h, 00000004h  ; vpermd indices: destination lane i takes source lane i+1
+        DWORD 00000005h, 00000006h, 00000007h, 00000007h
+ptr_L_fe_invert_nct_avx2_down_one_dword QWORD L_fe_invert_nct_avx2_down_one_dword
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_neg DWORD 00000000h, 00000000h, 00000000h, 00000000h  ; only the sign bit of lane 4 is set (next row)
+        DWORD 80000000h, 00000000h, 00000000h, 00000000h  ; vptest mask: is the top odd limb negative?
+ptr_L_fe_invert_nct_avx2_neg QWORD L_fe_invert_nct_avx2_neg
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_up_one_dword DWORD 00000007h, 00000000h, 00000001h, 00000002h  ; index table: lanes 1..4 name lanes 0..3, the rest name lane 7
+        DWORD 00000003h, 00000007h, 00000007h, 00000007h  ; NOTE(review): loaded into ymm13 by fe_invert_nct_avx2 but not used there
+ptr_L_fe_invert_nct_avx2_up_one_dword QWORD L_fe_invert_nct_avx2_up_one_dword
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_fe_invert_nct_avx2_mask26 DWORD 03ffffffh, 03ffffffh, 03ffffffh, 03ffffffh  ; 2^26 - 1 in lanes 0..4
+        DWORD 03ffffffh, 00000000h, 00000000h, 00000000h
+ptr_L_fe_invert_nct_avx2_mask26 QWORD L_fe_invert_nct_avx2_mask26
+_DATA ENDS
+; /* Non-constant time modular inversion modulo 2^255-19 (modulus is hard-coded).
+; *
+; * @param [out] r Resulting number: four 64-bit words stored through rcx.
+; * @param [in] a Number to invert: four 64-bit words loaded through rdx.
+; * NOTE(review): no modulus argument is read; r8 is overwritten with -19 first.
+; * NOTE(review): rax is scratch at ret, no status is set - confirm C prototype.
+; */
+_TEXT SEGMENT READONLY PARA
+fe_invert_nct_avx2 PROC
+        push r12
+        push r13
+        push r14
+        push r15
+        push rdi
+        push rsi
+        push rbx
+        sub rsp, 144  ; 9 * 16 bytes: save area for xmm6..xmm14
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu OWORD PTR [rsp+112], xmm13
+        vmovdqu OWORD PTR [rsp+128], xmm14
+        mov r8, -19  ; u = r8..r11 = 2^255 - 19 (four 64-bit words)
+        mov r9, -1
+        mov r10, -1
+        mov r11, 9223372036854775807  ; 2^63 - 1
+        mov r12, QWORD PTR [rdx]  ; v = r12..r15 = a
+        mov r13, QWORD PTR [rdx+8]
+        mov r14, QWORD PTR [rdx+16]
+        mov r15, QWORD PTR [rdx+24]
+        mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_prime]
+        vmovupd ymm6, YMMWORD PTR [rbx]  ; ymm6 = even 26-bit limbs of the prime
+        vmovupd ymm7, YMMWORD PTR [rbx+32]  ; ymm7 = odd 26-bit limbs of the prime
+        mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_one]
+        vmovupd ymm8, YMMWORD PTR [rbx]  ; ymm8 = 1 in lane 0
+        mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_mask01111]
+        vmovupd ymm9, YMMWORD PTR [rbx]  ; ymm9 = bit 0 of lanes 1..4
+        mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_all_one]
+        vmovupd ymm10, YMMWORD PTR [rbx]  ; ymm10 = bit 0 of every lane
+        mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_down_one_dword]
+        vmovupd ymm11, YMMWORD PTR [rbx]  ; ymm11 = vpermd indices moving lane i+1 to lane i
+        mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_neg]
+        vmovupd ymm12, YMMWORD PTR [rbx]  ; ymm12 = sign bit of lane 4
+        mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_up_one_dword]
+        vmovupd ymm13, YMMWORD PTR [rbx]  ; NOTE(review): ymm13 is not referenced again in this procedure
+        mov rbx, QWORD PTR [ptr_L_fe_invert_nct_avx2_mask26]
+        vmovupd ymm14, YMMWORD PTR [rbx]  ; NOTE(review): ymm14 is not referenced again in this procedure
+        vpxor xmm0, xmm0, xmm0  ; ymm0/ymm1 = coefficient tracked with u, starts at 0
+        vpxor xmm1, xmm1, xmm1
+        vmovdqu ymm2, ymm8  ; ymm2/ymm3 = coefficient tracked with v, starts at 1
+        vpxor xmm3, xmm3, xmm3
+        test r12b, 1
+        jnz L_fe_invert_nct_avx2_v_even_end  ; v already odd: skip the initial halving loop
+L_fe_invert_nct_avx2_v_even_start:
+        shrd r12, r13, 1  ; v >>= 1 across the four words
+        shrd r13, r14, 1
+        shrd r14, r15, 1
+        shr r15, 1
+        vptest ymm2, ymm8  ; ZF set when bit 0 of the coefficient's lane 0 is clear
+        jz L_fe_invert_nct_avx2_v_even_shr1
+        vpaddd ymm2, ymm2, ymm6  ; odd coefficient: add the prime before halving
+        vpaddd ymm3, ymm3, ymm7
+L_fe_invert_nct_avx2_v_even_shr1:
+        vpand ymm4, ymm2, ymm9  ; bit 0 of even limbs in lanes 1..4
+        vpand ymm5, ymm3, ymm10  ; bit 0 of each odd limb
+        vpermd ymm4, ymm11, ymm4  ; move those even-limb bits down one lane
+        vpsrad ymm2, ymm2, 1
+        vpsrad ymm3, ymm3, 1
+        vpslld ymm5, ymm5, 25  ; dropped low bits re-enter at bit 25 of the limb below
+        vpslld xmm4, xmm4, 25
+        vpaddd ymm2, ymm2, ymm5
+        vpaddd ymm3, ymm3, ymm4
+        test r12b, 1
+        jz L_fe_invert_nct_avx2_v_even_start
+L_fe_invert_nct_avx2_v_even_end:
+L_fe_invert_nct_avx2_uv_start:
+        cmp r11, r15  ; unsigned compare of u and v, most significant word first
+        jb L_fe_invert_nct_avx2_uv_v
+        ja L_fe_invert_nct_avx2_uv_u
+        cmp r10, r14
+        jb L_fe_invert_nct_avx2_uv_v
+        ja L_fe_invert_nct_avx2_uv_u
+        cmp r9, r13
+        jb L_fe_invert_nct_avx2_uv_v
+        ja L_fe_invert_nct_avx2_uv_u
+        cmp r8, r12
+        jb L_fe_invert_nct_avx2_uv_v
+L_fe_invert_nct_avx2_uv_u:
+        sub r8, r12  ; u -= v, with the matching coefficient update interleaved
+        sbb r9, r13
+        vpsubd ymm0, ymm0, ymm2
+        sbb r10, r14
+        vpsubd ymm1, ymm1, ymm3
+        sbb r11, r15
+        vptest ymm1, ymm12  ; is the top limb of the coefficient negative?
+        jz L_fe_invert_nct_avx2_usubv_done_neg
+        vpaddd ymm0, ymm0, ymm6  ; yes: add the prime
+        vpaddd ymm1, ymm1, ymm7
+L_fe_invert_nct_avx2_usubv_done_neg:
+L_fe_invert_nct_avx2_usubv_shr1:
+        shrd r8, r9, 1  ; u >>= 1
+        shrd r9, r10, 1
+        shrd r10, r11, 1
+        shr r11, 1
+        vptest ymm0, ymm8
+        jz L_fe_invert_nct_avx2_usubv_sub_shr1
+        vpaddd ymm0, ymm0, ymm6  ; odd coefficient: add the prime before halving
+        vpaddd ymm1, ymm1, ymm7
+L_fe_invert_nct_avx2_usubv_sub_shr1:
+        vpand ymm4, ymm0, ymm9
+        vpand ymm5, ymm1, ymm10
+        vpermd ymm4, ymm11, ymm4
+        vpsrad ymm0, ymm0, 1
+        vpsrad ymm1, ymm1, 1
+        vpslld ymm5, ymm5, 25
+        vpslld xmm4, xmm4, 25
+        vpaddd ymm0, ymm0, ymm5
+        vpaddd ymm1, ymm1, ymm4
+        test r8b, 1
+        jz L_fe_invert_nct_avx2_usubv_shr1  ; keep halving while u is even
+        cmp r8, 1
+        jne L_fe_invert_nct_avx2_uv_start
+        mov rax, r9
+        or rax, r10
+        jne L_fe_invert_nct_avx2_uv_start
+        or rax, r11
+        jne L_fe_invert_nct_avx2_uv_start  ; loop until u == 1
+        vpextrd r8d, xmm0, 0  ; even limbs 0,2,4,6 -> r8d, r10d, r12d, r14d
+        vpextrd r10d, xmm0, 1
+        vpextrd r12d, xmm0, 2
+        vpextrd r14d, xmm0, 3
+        vpextrd r9d, xmm1, 0  ; odd limbs 1,3,5,7 -> r9d, r11d, r13d, r15d
+        vpextrd r11d, xmm1, 1
+        vpextrd r13d, xmm1, 2
+        vpextrd r15d, xmm1, 3
+        vextracti128 xmm0, ymm0, 1
+        vextracti128 xmm1, ymm1, 1
+        vpextrd edi, xmm0, 0  ; limb 8
+        vpextrd esi, xmm1, 0  ; limb 9
+        jmp L_fe_invert_nct_avx2_store_done
+L_fe_invert_nct_avx2_uv_v:
+        sub r12, r8  ; v -= u, mirror image of the branch above
+        sbb r13, r9
+        vpsubd ymm2, ymm2, ymm0
+        sbb r14, r10
+        vpsubd ymm3, ymm3, ymm1
+        sbb r15, r11
+        vptest ymm3, ymm12
+        jz L_fe_invert_nct_avx2_vsubu_done_neg
+        vpaddd ymm2, ymm2, ymm6
+        vpaddd ymm3, ymm3, ymm7
+L_fe_invert_nct_avx2_vsubu_done_neg:
+L_fe_invert_nct_avx2_vsubu_shr1:
+        shrd r12, r13, 1  ; v >>= 1
+        shrd r13, r14, 1
+        shrd r14, r15, 1
+        shr r15, 1
+        vptest ymm2, ymm8
+        jz L_fe_invert_nct_avx2_vsubu_sub_shr1
+        vpaddd ymm2, ymm2, ymm6
+        vpaddd ymm3, ymm3, ymm7
+L_fe_invert_nct_avx2_vsubu_sub_shr1:
+        vpand ymm4, ymm2, ymm9
+        vpand ymm5, ymm3, ymm10
+        vpermd ymm4, ymm11, ymm4
+        vpsrad ymm2, ymm2, 1
+        vpsrad ymm3, ymm3, 1
+        vpslld ymm5, ymm5, 25
+        vpslld xmm4, xmm4, 25
+        vpaddd ymm2, ymm2, ymm5
+        vpaddd ymm3, ymm3, ymm4
+        test r12b, 1
+        jz L_fe_invert_nct_avx2_vsubu_shr1  ; keep halving while v is even
+        cmp r12, 1
+        jne L_fe_invert_nct_avx2_uv_start
+        mov rax, r13
+        or rax, r14
+        jne L_fe_invert_nct_avx2_uv_start
+        or rax, r15
+        jne L_fe_invert_nct_avx2_uv_start  ; loop until v == 1
+        vpextrd r8d, xmm2, 0  ; same limb-to-register mapping as the u == 1 exit
+        vpextrd r10d, xmm2, 1
+        vpextrd r12d, xmm2, 2
+        vpextrd r14d, xmm2, 3
+        vpextrd r9d, xmm3, 0
+        vpextrd r11d, xmm3, 1
+        vpextrd r13d, xmm3, 2
+        vpextrd r15d, xmm3, 3
+        vextracti128 xmm2, ymm2, 1
+        vextracti128 xmm3, ymm3, 1
+        vpextrd edi, xmm2, 0
+        vpextrd esi, xmm3, 0
+L_fe_invert_nct_avx2_store_done:
+        mov eax, r8d  ; signed carry propagation through the ten limbs: keep 26 bits, pass the rest up
+        and r8d, 67108863  ; 67108863 = 2^26 - 1
+        sar eax, 26
+        add r9d, eax
+        mov eax, r9d
+        and r9d, 67108863
+        sar eax, 26
+        add r10d, eax
+        mov eax, r10d
+        and r10d, 67108863
+        sar eax, 26
+        add r11d, eax
+        mov eax, r11d
+        and r11d, 67108863
+        sar eax, 26
+        add r12d, eax
+        mov eax, r12d
+        and r12d, 67108863
+        sar eax, 26
+        add r13d, eax
+        mov eax, r13d
+        and r13d, 67108863
+        sar eax, 26
+        add r14d, eax
+        mov eax, r14d
+        and r14d, 67108863
+        sar eax, 26
+        add r15d, eax
+        mov eax, r15d
+        and r15d, 67108863
+        sar eax, 26
+        add edi, eax
+        mov eax, edi
+        and edi, 67108863
+        sar eax, 26
+        add esi, eax  ; top limb keeps its sign
+        movsxd r9, r9d  ; pair limbs into five 52-bit values: even + (odd << 26)
+        movsxd r11, r11d
+        movsxd r13, r13d
+        movsxd r15, r15d
+        movsxd rsi, esi
+        shl r9, 26
+        shl r11, 26
+        shl r13, 26
+        shl r15, 26
+        shl rsi, 26
+        movsxd r8, r8d
+        add r8, r9
+        movsxd r10, r10d
+        adc r10, r11
+        movsxd r12, r12d
+        adc r12, r13
+        movsxd r14, r14d
+        adc r14, r15
+        movsxd rdi, edi
+        adc rdi, rsi
+        jge L_fe_invert_nct_avx2_uv_start_no_add_prime  ; top 52-bit value not negative: skip the correction
+        mov r9, 4503599627370477  ; 2^52 - 19: prime in 52-bit limbs, limb 0
+        mov r11, 4503599627370495  ; 2^52 - 1
+        mov r13, 4503599627370495
+        mov r15, 4503599627370495
+        mov rsi, 140737488355327  ; 2^47 - 1: top limb (4*52 + 47 = 255)
+        add r8, r9
+        add r10, r11
+        add r12, r13
+        add r14, r15
+        add rdi, rsi
+        mov rax, 4503599627370495  ; 52-bit mask for the carry pass below
+        mov r9, r8
+        and r8, rax
+        sar r9, 52
+        add r10, r9
+        mov r11, r10
+        and r10, rax
+        sar r11, 52
+        add r12, r11
+        mov r13, r12
+        and r12, rax
+        sar r13, 52
+        add r14, r13
+        mov r15, r14
+        and r14, rax
+        sar r15, 52
+        add rdi, r15
+L_fe_invert_nct_avx2_uv_start_no_add_prime:
+        mov r9, r10  ; repack five 52-bit values into four 64-bit words
+        mov r11, r12
+        mov r13, r14
+        shl r9, 52
+        sar r10, 12
+        shl r11, 40
+        sar r12, 24
+        shl r13, 28
+        sar r14, 36
+        shl rdi, 16
+        add r8, r9
+        adc r10, r11
+        adc r12, r13
+        adc r14, rdi
+        mov QWORD PTR [rcx], r8  ; store r
+        mov QWORD PTR [rcx+8], r10
+        mov QWORD PTR [rcx+16], r12
+        mov QWORD PTR [rcx+24], r14
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp]  ; restore xmm6..xmm14 from the save area
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        vmovdqu xmm13, OWORD PTR [rsp+112]
+        vmovdqu xmm14, OWORD PTR [rsp+128]
+        add rsp, 144
+        pop rbx
+        pop rsi
+        pop rdi
+        pop r15
+        pop r14
+        pop r13
+        pop r12
+        ret
+fe_invert_nct_avx2 ENDP
+_TEXT ENDS
+ENDIF
+ENDIF
+END
diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am
index 908c43984cd..dba03978c44 100644
--- a/wolfcrypt/src/include.am
+++ b/wolfcrypt/src/include.am
@@ -18,9 +18,16 @@ EXTRA_DIST +=
wolfcrypt/src/asm.c EXTRA_DIST += wolfcrypt/src/aes_asm.asm EXTRA_DIST += wolfcrypt/src/aes_x86_64_asm.asm EXTRA_DIST += wolfcrypt/src/aes_gcm_asm.asm +EXTRA_DIST += wolfcrypt/src/aes_gcm_x86_asm.asm EXTRA_DIST += wolfcrypt/src/aes_xts_asm.asm EXTRA_DIST += wolfcrypt/src/chacha_asm.asm EXTRA_DIST += wolfcrypt/src/poly1305_asm.asm +EXTRA_DIST += wolfcrypt/src/fe_x25519_asm.asm +EXTRA_DIST += wolfcrypt/src/sha256_asm.asm +EXTRA_DIST += wolfcrypt/src/sha512_asm.asm +EXTRA_DIST += wolfcrypt/src/sha3_asm.asm +EXTRA_DIST += wolfcrypt/src/wc_mlkem_asm.asm +EXTRA_DIST += wolfcrypt/src/wc_mldsa_asm.asm EXTRA_DIST += wolfcrypt/src/wc_dsp.c EXTRA_DIST += wolfcrypt/src/sp_dsp32.c EXTRA_DIST += wolfcrypt/src/sp_x86_64_asm.asm diff --git a/wolfcrypt/src/poly1305_asm.asm b/wolfcrypt/src/poly1305_asm.asm index 95c3764acac..ae34937a184 100644 --- a/wolfcrypt/src/poly1305_asm.asm +++ b/wolfcrypt/src/poly1305_asm.asm @@ -598,16 +598,14 @@ poly1305_setkey_avx2 ENDP _TEXT ENDS _DATA SEGMENT ALIGN 16 -L_poly1305_avx2_blocks_mask QWORD \ - 0000000003ffffffh, 0000000003ffffffh, - 0000000003ffffffh, 0000000003ffffffh +L_poly1305_avx2_blocks_mask QWORD 0000000003ffffffh, 0000000003ffffffh + QWORD 0000000003ffffffh, 0000000003ffffffh ptr_L_poly1305_avx2_blocks_mask QWORD L_poly1305_avx2_blocks_mask _DATA ENDS _DATA SEGMENT ALIGN 16 -L_poly1305_avx2_blocks_hibit QWORD \ - 0000000001000000h, 0000000001000000h, - 0000000001000000h, 0000000001000000h +L_poly1305_avx2_blocks_hibit QWORD 0000000001000000h, 0000000001000000h + QWORD 0000000001000000h, 0000000001000000h ptr_L_poly1305_avx2_blocks_hibit QWORD L_poly1305_avx2_blocks_hibit _DATA ENDS _TEXT SEGMENT READONLY PARA @@ -736,15 +734,15 @@ L_poly1305_avx2_blocks_mul_5: vpaddq ymm12, ymm8, ymm12 vpaddq ymm13, ymm9, ymm13 ; Store powers of r and multiple of 5 for use in multiply. 
- vmovdqa YMMWORD PTR [rbx], ymm10 - vmovdqa YMMWORD PTR [rbx+32], ymm11 - vmovdqa YMMWORD PTR [rbx+64], ymm12 - vmovdqa YMMWORD PTR [rbx+96], ymm13 - vmovdqa YMMWORD PTR [rcx], ymm5 - vmovdqa YMMWORD PTR [rcx+32], ymm6 - vmovdqa YMMWORD PTR [rcx+64], ymm7 - vmovdqa YMMWORD PTR [rcx+96], ymm8 - vmovdqa YMMWORD PTR [rcx+128], ymm9 + vmovdqu YMMWORD PTR [rbx], ymm10 + vmovdqu YMMWORD PTR [rbx+32], ymm11 + vmovdqu YMMWORD PTR [rbx+64], ymm12 + vmovdqu YMMWORD PTR [rbx+96], ymm13 + vmovdqu YMMWORD PTR [rcx], ymm5 + vmovdqu YMMWORD PTR [rcx+32], ymm6 + vmovdqu YMMWORD PTR [rcx+64], ymm7 + vmovdqu YMMWORD PTR [rcx+96], ymm8 + vmovdqu YMMWORD PTR [rcx+128], ymm9 vmovdqu ymm14, YMMWORD PTR [r13] ; If not finished then loop over data cmp BYTE PTR [rdi+616], 1 diff --git a/wolfcrypt/src/sha256_asm.asm b/wolfcrypt/src/sha256_asm.asm new file mode 100644 index 00000000000..bd4aa0deb8f --- /dev/null +++ b/wolfcrypt/src/sha256_asm.asm @@ -0,0 +1,23463 @@ +; /* sha256_asm.asm */ +; /* +; * Copyright (C) 2006-2026 wolfSSL Inc. +; * +; * This file is part of wolfSSL. +; * +; * wolfSSL is free software; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation; either version 3 of the License, or +; * (at your option) any later version. +; * +; * wolfSSL is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. 
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+; */
+
+IF @Version LT 1200
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT
+NO_AVX2_SUPPORT = 1
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF
+
+IFNDEF HAVE_INTEL_AVX1
+HAVE_INTEL_AVX1 = 1
+ENDIF
+IFNDEF NO_AVX2_SUPPORT
+HAVE_INTEL_AVX2 = 1
+ENDIF
+
+IFNDEF _WIN64
+_WIN64 = 1
+ENDIF
+
+IFDEF WOLFSSL_X86_64_BUILD
+_DATA SEGMENT
+ALIGN 16
+L_sse2_sha256_sha_k DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h  ; 64 SHA-256 round constants, read 16 bytes (4 rounds) at a time via rax
+        DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h
+        DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h
+        DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h
+        DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch
+        DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah
+        DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h
+        DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h
+        DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h
+        DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h
+        DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h
+        DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h
+        DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h
+        DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h
+        DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h
+        DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h
+ptr_L_sse2_sha256_sha_k QWORD L_sse2_sha256_sha_k
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_sse2_sha256_shuf_mask QWORD 0405060700010203h, 0c0d0e0f08090a0bh  ; pshufb control: reverses the bytes inside each 32-bit word
+ptr_L_sse2_sha256_shuf_mask QWORD L_sse2_sha256_shuf_mask
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha256_SSE2_Sha PROC  ; rcx -> 32-byte state (read, then updated in place), rdx -> one 64-byte block; rax = 0 on return
+        sub rsp, 80  ; scratch for the five XMM registers (xmm6-xmm10) this routine clobbers and must preserve
+        movdqu OWORD PTR [rsp], xmm6  ; legacy-SSE encoding on purpose: VEX vmovdqu faults (#UD) on SHA-NI CPUs that lack AVX
+        movdqu OWORD PTR [rsp+16], xmm7  ; unaligned form because rsp is not 16-byte aligned after 'sub rsp, 80'
+        movdqu OWORD PTR [rsp+32], xmm8
+        movdqu OWORD PTR [rsp+48], xmm9
+        movdqu OWORD PTR [rsp+64], xmm10
+        mov rax, QWORD PTR [ptr_L_sse2_sha256_sha_k]  ; rax -> round-constant table
+        movdqa xmm10, OWORD PTR L_sse2_sha256_shuf_mask  ; xmm10 = byte-swap mask for message words
+        movq xmm1, QWORD PTR [rcx]  ; gather state words 0,1 / 4,5 into xmm1 and 2,3 / 6,7 into xmm2
+        movq xmm2, QWORD PTR [rcx+8]
+        movhpd xmm1, QWORD PTR [rcx+16]
+        movhpd xmm2, QWORD PTR [rcx+24]
+        pshufd xmm1, xmm1, 27  ; 27 = 00011011b: reverse dword order, the layout sha256rnds2 operates on
+        pshufd xmm2, xmm2, 27
+        movdqu xmm3, OWORD PTR [rdx]  ; load the 64-byte block into xmm3..xmm6 (unaligned loads)
+        movdqu xmm4, OWORD PTR [rdx+16]
+        movdqu xmm5, OWORD PTR [rdx+32]
+        movdqu xmm6, OWORD PTR [rdx+48]
+        pshufb xmm3, xmm10
+        movdqa xmm8, xmm1  ; keep the incoming state for the final add
+        movdqa xmm9, xmm2
+        ; Rounds: 0-3
+        movdqa xmm0, xmm3
+        paddd xmm0, OWORD PTR [rax]
+        sha256rnds2 xmm2, xmm1, xmm0
+        pshufd xmm0, xmm0, 14  ; move the upper two W+K dwords down for the second pair of rounds
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 4-7
+        pshufb xmm4, xmm10
+        movdqa xmm0, xmm4
+        paddd xmm0, OWORD PTR [rax+16]
+        sha256rnds2 xmm2, xmm1, xmm0
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm3, xmm4
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 8-11
+        pshufb xmm5, xmm10
+        movdqa xmm0, xmm5
+        paddd xmm0, OWORD PTR [rax+32]
+        sha256rnds2 xmm2, xmm1, xmm0
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm4, xmm5
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 12-15
+        pshufb xmm6, xmm10
+        movdqa xmm0, xmm6
+        paddd xmm0, OWORD PTR [rax+48]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm6
+        palignr xmm7, xmm5, 4
+        paddd xmm3, xmm7
+        sha256msg2 xmm3, xmm6
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm5, xmm6
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 16-19
+        movdqa xmm0, xmm3
+        paddd xmm0, OWORD PTR [rax+64]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm3
+        palignr xmm7, xmm6, 4
+        paddd xmm4, xmm7
+        sha256msg2 xmm4, xmm3
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm6, xmm3
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 20-23
+        movdqa xmm0, xmm4
+        paddd xmm0, OWORD PTR [rax+80]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm4
+        palignr xmm7, xmm3, 4
+        paddd xmm5, xmm7
+        sha256msg2 xmm5, xmm4
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm3, xmm4
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 24-27
+        movdqa xmm0, xmm5
+        paddd xmm0, OWORD PTR [rax+96]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm5
+        palignr xmm7, xmm4, 4
+        paddd xmm6, xmm7
+        sha256msg2 xmm6, xmm5
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm4, xmm5
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 28-31
+        movdqa xmm0, xmm6
+        paddd xmm0, OWORD PTR [rax+112]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm6
+        palignr xmm7, xmm5, 4
+        paddd xmm3, xmm7
+        sha256msg2 xmm3, xmm6
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm5, xmm6
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 32-35
+        movdqa xmm0, xmm3
+        paddd xmm0, OWORD PTR [rax+128]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm3
+        palignr xmm7, xmm6, 4
+        paddd xmm4, xmm7
+        sha256msg2 xmm4, xmm3
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm6, xmm3
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 36-39
+        movdqa xmm0, xmm4
+        paddd xmm0, OWORD PTR [rax+144]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm4
+        palignr xmm7, xmm3, 4
+        paddd xmm5, xmm7
+        sha256msg2 xmm5, xmm4
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm3, xmm4
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 40-43
+        movdqa xmm0, xmm5
+        paddd xmm0, OWORD PTR [rax+160]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm5
+        palignr xmm7, xmm4, 4
+        paddd xmm6, xmm7
+        sha256msg2 xmm6, xmm5
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm4, xmm5
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 44-47
+        movdqa xmm0, xmm6
+        paddd xmm0, OWORD PTR [rax+176]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm6
+        palignr xmm7, xmm5, 4
+        paddd xmm3, xmm7
+        sha256msg2 xmm3, xmm6
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm5, xmm6
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 48-51
+        movdqa xmm0, xmm3
+        paddd xmm0, OWORD PTR [rax+192]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm3
+        palignr xmm7, xmm6, 4
+        paddd xmm4, xmm7
+        sha256msg2 xmm4, xmm3
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm6, xmm3
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 52-63
+        movdqa xmm0, xmm4
+        paddd xmm0, OWORD PTR [rax+208]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm4
+        palignr xmm7, xmm3, 4
+        paddd xmm5, xmm7
+        sha256msg2 xmm5, xmm4
+        pshufd xmm0, xmm0, 14
+        sha256rnds2 xmm1, xmm2, xmm0
+        movdqa xmm0, xmm5
+        paddd xmm0, OWORD PTR [rax+224]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm5
+        palignr xmm7, xmm4, 4
+        paddd xmm6, xmm7
+        sha256msg2 xmm6, xmm5
+        pshufd xmm0, xmm0, 14
+        sha256rnds2 xmm1, xmm2, xmm0
+        movdqa xmm0, xmm6
+        paddd xmm0, OWORD PTR [rax+240]
+        sha256rnds2 xmm2, xmm1, xmm0
+        pshufd xmm0, xmm0, 14
+        sha256rnds2 xmm1, xmm2, xmm0
+        paddd xmm1, xmm8  ; add the state saved before round 0
+        paddd xmm2, xmm9
+        pshufd xmm1, xmm1, 27  ; undo the dword reversal applied on load
+        pshufd xmm2, xmm2, 27
+        movq QWORD PTR [rcx], xmm1  ; scatter back using the same layout as the load
+        movq QWORD PTR [rcx+8], xmm2
+        movhpd QWORD PTR [rcx+16], xmm1
+        movhpd QWORD PTR [rcx+24], xmm2
+        xor rax, rax  ; return 0
+        movdqu xmm6, OWORD PTR [rsp]  ; restore the saved XMM registers (legacy-SSE encoding, see prologue)
+        movdqu xmm7, OWORD PTR [rsp+16]
+        movdqu xmm8, OWORD PTR [rsp+32]
+        movdqu xmm9, OWORD PTR [rsp+48]
+        movdqu xmm10, OWORD PTR [rsp+64]
+        add rsp, 80
+        ret
+Transform_Sha256_SSE2_Sha ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha256_SSE2_Sha_Len PROC  ; rcx -> 32-byte state, rdx -> input, r8d = byte count; rax = 0 on return
+        sub rsp, 80  ; scratch for the five XMM registers (xmm6-xmm10) this routine clobbers and must preserve
+        movdqu OWORD PTR [rsp], xmm6  ; legacy-SSE encoding on purpose: VEX vmovdqu faults (#UD) on SHA-NI CPUs that lack AVX
+        movdqu OWORD PTR [rsp+16], xmm7  ; unaligned form because rsp is not 16-byte aligned after 'sub rsp, 80'
+        movdqu OWORD PTR [rsp+32], xmm8
+        movdqu OWORD PTR [rsp+48], xmm9
+        movdqu OWORD PTR [rsp+64], xmm10
+        mov rax, QWORD PTR [ptr_L_sse2_sha256_sha_k]  ; rax -> round-constant table
+        movdqa xmm10, OWORD PTR L_sse2_sha256_shuf_mask  ; xmm10 = byte-swap mask for message words
+        movq xmm1, QWORD PTR [rcx]  ; state is loaded once and stays in xmm1/xmm2 across all blocks
+        movq xmm2, QWORD PTR [rcx+8]
+        movhpd xmm1, QWORD PTR [rcx+16]
+        movhpd xmm2, QWORD PTR [rcx+24]
+        pshufd xmm1, xmm1, 27  ; 27 = 00011011b: reverse dword order, the layout sha256rnds2 operates on
+        pshufd xmm2, xmm2, 27
+        ; Start of loop processing a block
+L_sha256_sha_len_sse2_start:
+        movdqu xmm3, OWORD PTR [rdx]  ; load the next 64-byte block into xmm3..xmm6 (unaligned loads)
+        movdqu xmm4, OWORD PTR [rdx+16]
+        movdqu xmm5, OWORD PTR [rdx+32]
+        movdqu xmm6, OWORD PTR [rdx+48]
+        pshufb xmm3, xmm10
+        movdqa xmm8, xmm1  ; keep the state at block entry for the per-block add
+        movdqa xmm9, xmm2
+        ; Rounds: 0-3
+        movdqa xmm0, xmm3
+        paddd xmm0, OWORD PTR [rax]
+        sha256rnds2 xmm2, xmm1, xmm0
+        pshufd xmm0, xmm0, 14  ; move the upper two W+K dwords down for the second pair of rounds
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 4-7
+        pshufb xmm4, xmm10
+        movdqa xmm0, xmm4
+        paddd xmm0, OWORD PTR [rax+16]
+        sha256rnds2 xmm2, xmm1, xmm0
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm3, xmm4
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 8-11
+        pshufb xmm5, xmm10
+        movdqa xmm0, xmm5
+        paddd xmm0, OWORD PTR [rax+32]
+        sha256rnds2 xmm2, xmm1, xmm0
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm4, xmm5
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 12-15
+        pshufb xmm6, xmm10
+        movdqa xmm0, xmm6
+        paddd xmm0, OWORD PTR [rax+48]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm6
+        palignr xmm7, xmm5, 4
+        paddd xmm3, xmm7
+        sha256msg2 xmm3, xmm6
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm5, xmm6
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 16-19
+        movdqa xmm0, xmm3
+        paddd xmm0, OWORD PTR [rax+64]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm3
+        palignr xmm7, xmm6, 4
+        paddd xmm4, xmm7
+        sha256msg2 xmm4, xmm3
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm6, xmm3
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 20-23
+        movdqa xmm0, xmm4
+        paddd xmm0, OWORD PTR [rax+80]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm4
+        palignr xmm7, xmm3, 4
+        paddd xmm5, xmm7
+        sha256msg2 xmm5, xmm4
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm3, xmm4
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 24-27
+        movdqa xmm0, xmm5
+        paddd xmm0, OWORD PTR [rax+96]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm5
+        palignr xmm7, xmm4, 4
+        paddd xmm6, xmm7
+        sha256msg2 xmm6, xmm5
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm4, xmm5
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 28-31
+        movdqa xmm0, xmm6
+        paddd xmm0, OWORD PTR [rax+112]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm6
+        palignr xmm7, xmm5, 4
+        paddd xmm3, xmm7
+        sha256msg2 xmm3, xmm6
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm5, xmm6
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 32-35
+        movdqa xmm0, xmm3
+        paddd xmm0, OWORD PTR [rax+128]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm3
+        palignr xmm7, xmm6, 4
+        paddd xmm4, xmm7
+        sha256msg2 xmm4, xmm3
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm6, xmm3
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 36-39
+        movdqa xmm0, xmm4
+        paddd xmm0, OWORD PTR [rax+144]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm4
+        palignr xmm7, xmm3, 4
+        paddd xmm5, xmm7
+        sha256msg2 xmm5, xmm4
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm3, xmm4
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 40-43
+        movdqa xmm0, xmm5
+        paddd xmm0, OWORD PTR [rax+160]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm5
+        palignr xmm7, xmm4, 4
+        paddd xmm6, xmm7
+        sha256msg2 xmm6, xmm5
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm4, xmm5
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 44-47
+        movdqa xmm0, xmm6
+        paddd xmm0, OWORD PTR [rax+176]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm6
+        palignr xmm7, xmm5, 4
+        paddd xmm3, xmm7
+        sha256msg2 xmm3, xmm6
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm5, xmm6
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 48-51
+        movdqa xmm0, xmm3
+        paddd xmm0, OWORD PTR [rax+192]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm3
+        palignr xmm7, xmm6, 4
+        paddd xmm4, xmm7
+        sha256msg2 xmm4, xmm3
+        pshufd xmm0, xmm0, 14
+        sha256msg1 xmm6, xmm3
+        sha256rnds2 xmm1, xmm2, xmm0
+        ; Rounds: 52-63
+        movdqa xmm0, xmm4
+        paddd xmm0, OWORD PTR [rax+208]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm4
+        palignr xmm7, xmm3, 4
+        paddd xmm5, xmm7
+        sha256msg2 xmm5, xmm4
+        pshufd xmm0, xmm0, 14
+        sha256rnds2 xmm1, xmm2, xmm0
+        movdqa xmm0, xmm5
+        paddd xmm0, OWORD PTR [rax+224]
+        sha256rnds2 xmm2, xmm1, xmm0
+        movdqa xmm7, xmm5
+        palignr xmm7, xmm4, 4
+        paddd xmm6, xmm7
+        sha256msg2 xmm6, xmm5
+        pshufd xmm0, xmm0, 14
+        sha256rnds2 xmm1, xmm2, xmm0
+        movdqa xmm0, xmm6
+        paddd xmm0, OWORD PTR [rax+240]
+        sha256rnds2 xmm2, xmm1, xmm0
+        pshufd xmm0, xmm0, 14
+        sha256rnds2 xmm1, xmm2, xmm0
+        add rdx, 64  ; advance to the next input block
+        sub r8d, 64  ; sets ZF for the jnz below; the paddd instructions in between do not touch flags
+        paddd xmm1, xmm8  ; add the state saved at block entry
+        paddd xmm2, xmm9
+        jnz L_sha256_sha_len_sse2_start  ; exits only when r8d hits exactly 0 - NOTE(review): relies on the caller passing a non-zero multiple of 64
+        pshufd xmm1, xmm1, 27  ; undo the dword reversal applied on load
+        pshufd xmm2, xmm2, 27
+        movq QWORD PTR [rcx], xmm1  ; scatter back using the same layout as the load
+        movq QWORD PTR [rcx+8], xmm2
+        movhpd QWORD PTR [rcx+16], xmm1
+        movhpd QWORD PTR [rcx+24], xmm2
+        xor rax, rax  ; return 0
+        movdqu xmm6, OWORD PTR [rsp]  ; restore the saved XMM registers (legacy-SSE encoding, see prologue)
+        movdqu xmm7, OWORD PTR [rsp+16]
+        movdqu xmm8, OWORD PTR [rsp+32]
+        movdqu xmm9, OWORD PTR [rsp+48]
+        movdqu xmm10, OWORD PTR [rsp+64]
+        add rsp, 80
+        ret
+Transform_Sha256_SSE2_Sha_Len
ENDP +_TEXT ENDS +IFDEF HAVE_INTEL_AVX1 +_DATA SEGMENT +ALIGN 16 +L_avx1_sha256_k DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h + DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h + DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h + DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h + DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch + DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah + DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h + DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h + DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h + DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h + DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h + DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h + DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h + DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h + DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h + DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h +ptr_L_avx1_sha256_k QWORD L_avx1_sha256_k +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_sha256_shuf_00BA QWORD 0b0a090803020100h, 0ffffffffffffffffh +ptr_L_avx1_sha256_shuf_00BA QWORD L_avx1_sha256_shuf_00BA +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_sha256_shuf_DC00 QWORD 0ffffffffffffffffh, 0b0a090803020100h +ptr_L_avx1_sha256_shuf_DC00 QWORD L_avx1_sha256_shuf_DC00 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_sha256_flip_mask QWORD 0405060700010203h, 0c0d0e0f08090a0bh +ptr_L_avx1_sha256_flip_mask QWORD L_avx1_sha256_flip_mask +_DATA ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha256_AVX1 PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rbp + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + sub rsp, 192 + vmovdqu OWORD PTR [rsp+64], xmm6 + vmovdqu OWORD PTR [rsp+80], xmm7 + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + vmovdqu OWORD PTR [rsp+128], xmm10 + vmovdqu OWORD PTR [rsp+144], xmm11 + vmovdqu OWORD PTR [rsp+160], xmm12 + vmovdqu OWORD PTR [rsp+176], xmm13 + mov rbp, QWORD PTR [ptr_L_avx1_sha256_k] + vmovdqa 
xmm13, OWORD PTR L_avx1_sha256_flip_mask + vmovdqa xmm11, OWORD PTR L_avx1_sha256_shuf_00BA + vmovdqa xmm12, OWORD PTR L_avx1_sha256_shuf_DC00 + mov r8d, DWORD PTR [rdi] + mov r9d, DWORD PTR [rdi+4] + mov r10d, DWORD PTR [rdi+8] + mov r11d, DWORD PTR [rdi+12] + mov r12d, DWORD PTR [rdi+16] + mov r13d, DWORD PTR [rdi+20] + mov r14d, DWORD PTR [rdi+24] + mov r15d, DWORD PTR [rdi+28] + ; X0, X1, X2, X3 = W[0..15] + vmovdqu xmm0, OWORD PTR [rsi] + vmovdqu xmm1, OWORD PTR [rsi+16] + vpshufb xmm0, xmm0, xmm13 + vpshufb xmm1, xmm1, xmm13 + vmovdqu xmm2, OWORD PTR [rsi+32] + vmovdqu xmm3, OWORD PTR [rsi+48] + vpshufb xmm2, xmm2, xmm13 + vpshufb xmm3, xmm3, xmm13 + mov ebx, r9d + mov edx, r12d + xor ebx, r10d + ; set_w_k_xfer_4: 0 + vpaddd xmm4, xmm0, OWORD PTR [rbp] + vpaddd xmm5, xmm1, OWORD PTR [rbp+16] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+32] + vpaddd xmm7, xmm3, OWORD PTR [rbp+48] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm1, xmm0, 4 + vpalignr xmm4, xmm3, xmm2, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+4] + xor ecx, r13d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add 
r14d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm3, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm0 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+8] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+12] + xor ecx, r11d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd xmm0, xmm9, xmm4 + ; msg_sched done: 0-3 + ; msg_sched: 4-7 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm2, xmm1, 4 + vpalignr xmm4, xmm0, xmm3, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+16] + xor 
ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+20] + xor ecx, r9d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm0, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm1 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+24] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+28] + xor ecx, r15d + vpsrlq 
xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd xmm1, xmm9, xmm4 + ; msg_sched done: 4-7 + ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm3, xmm2, 4 + vpalignr xmm4, xmm1, xmm0, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+32] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+36] + xor ecx, r13d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm1, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm2 
+ ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+40] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+44] + xor ecx, r11d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd xmm2, xmm9, xmm4 + ; msg_sched done: 8-11 + ; msg_sched: 12-15 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm0, xmm3, 4 + vpalignr xmm4, xmm2, xmm1, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+48] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor xmm6, xmm7, xmm6 + vpor 
xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+52] + xor ecx, r9d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm2, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm3 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+56] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+60] + xor ecx, r15d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb 
xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd xmm3, xmm9, xmm4 + ; msg_sched done: 12-15 + ; set_w_k_xfer_4: 4 + vpaddd xmm4, xmm0, OWORD PTR [rbp+64] + vpaddd xmm5, xmm1, OWORD PTR [rbp+80] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+96] + vpaddd xmm7, xmm3, OWORD PTR [rbp+112] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm1, xmm0, 4 + vpalignr xmm4, xmm3, xmm2, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+4] + xor ecx, r13d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm3, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm0 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+8] + xor ecx, r12d + xor edx, r10d + and ecx, 
r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+12] + xor ecx, r11d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd xmm0, xmm9, xmm4 + ; msg_sched done: 0-3 + ; msg_sched: 4-7 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm2, xmm1, 4 + vpalignr xmm4, xmm0, xmm3, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+16] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + 
mov ecx, r8d + add r10d, DWORD PTR [rsp+20] + xor ecx, r9d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm0, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm1 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+24] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+28] + xor ecx, r15d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd xmm1, xmm9, xmm4 + ; msg_sched done: 4-7 
+ ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm3, xmm2, 4 + vpalignr xmm4, xmm1, xmm0, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+32] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+36] + xor ecx, r13d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm1, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm2 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+40] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + 
ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+44] + xor ecx, r11d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd xmm2, xmm9, xmm4 + ; msg_sched done: 8-11 + ; msg_sched: 12-15 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm0, xmm3, 4 + vpalignr xmm4, xmm2, xmm1, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+48] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+52] + xor ecx, r9d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm2, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, 
xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm3 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+56] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+60] + xor ecx, r15d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd xmm3, xmm9, xmm4 + ; msg_sched done: 12-15 + ; set_w_k_xfer_4: 8 + vpaddd xmm4, xmm0, OWORD PTR [rbp+128] + vpaddd xmm5, xmm1, OWORD PTR [rbp+144] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+160] + vpaddd xmm7, xmm3, OWORD PTR [rbp+176] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm1, xmm0, 4 + vpalignr xmm4, xmm3, xmm2, 4 + ; 
rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+4] + xor ecx, r13d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm3, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm0 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+8] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov 
ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+12] + xor ecx, r11d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd xmm0, xmm9, xmm4 + ; msg_sched done: 0-3 + ; msg_sched: 4-7 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm2, xmm1, 4 + vpalignr xmm4, xmm0, xmm3, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+16] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+20] + xor ecx, r9d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm0, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add 
r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm1 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+24] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+28] + xor ecx, r15d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd xmm1, xmm9, xmm4 + ; msg_sched done: 4-7 + ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm3, xmm2, 4 + vpalignr xmm4, xmm1, xmm0, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+32] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add 
r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+36] + xor ecx, r13d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm1, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm2 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+40] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+44] + xor ecx, r11d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor xmm9, xmm8, xmm9 + ; 
rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd xmm2, xmm9, xmm4 + ; msg_sched done: 8-11 + ; msg_sched: 12-15 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm0, xmm3, 4 + vpalignr xmm4, xmm2, xmm1, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+48] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+52] + xor ecx, r9d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm2, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm3 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+56] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 
+ and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+60] + xor ecx, r15d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd xmm3, xmm9, xmm4 + ; msg_sched done: 12-15 + ; set_w_k_xfer_4: 12 + vpaddd xmm4, xmm0, OWORD PTR [rbp+192] + vpaddd xmm5, xmm1, OWORD PTR [rbp+208] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+224] + vpaddd xmm7, xmm3, OWORD PTR [rbp+240] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + ; rnd_all_4: 0-3 + add r15d, DWORD PTR [rsp] + mov ecx, r13d + mov eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+4] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and 
eax, ebx + xor ecx, r15d + xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+8] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+12] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 1-4 + add r11d, DWORD PTR [rsp+16] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+20] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+24] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor 
eax, r10d + ror edx, 6 + mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+28] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + ; rnd_all_4: 2-5 + add r15d, DWORD PTR [rsp+32] + mov ecx, r13d + mov eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+36] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+40] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+44] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, 
r9d + xor ecx, r11d + ror edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 3-6 + add r11d, DWORD PTR [rsp+48] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+52] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+56] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+60] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + add DWORD PTR [rdi], r8d + add DWORD PTR [rdi+4], 
r9d + add DWORD PTR [rdi+8], r10d + add DWORD PTR [rdi+12], r11d + add DWORD PTR [rdi+16], r12d + add DWORD PTR [rdi+20], r13d + add DWORD PTR [rdi+24], r14d + add DWORD PTR [rdi+28], r15d + xor rax, rax + vmovdqu xmm6, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm8, OWORD PTR [rsp+96] + vmovdqu xmm9, OWORD PTR [rsp+112] + vmovdqu xmm10, OWORD PTR [rsp+128] + vmovdqu xmm11, OWORD PTR [rsp+144] + vmovdqu xmm12, OWORD PTR [rsp+160] + vmovdqu xmm13, OWORD PTR [rsp+176] + add rsp, 192 + pop rsi + pop rdi + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha256_AVX1 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha256_AVX1_Len PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbp + mov rdi, rcx + mov rsi, rdx + mov rbp, r8 + sub rsp, 196 + vmovdqu OWORD PTR [rsp+64], xmm6 + vmovdqu OWORD PTR [rsp+80], xmm7 + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + vmovdqu OWORD PTR [rsp+128], xmm10 + vmovdqu OWORD PTR [rsp+144], xmm11 + vmovdqu OWORD PTR [rsp+160], xmm12 + vmovdqu OWORD PTR [rsp+176], xmm13 + mov DWORD PTR [rsp+64], ebp + mov rbp, QWORD PTR [ptr_L_avx1_sha256_k] + vmovdqa xmm13, OWORD PTR L_avx1_sha256_flip_mask + vmovdqa xmm11, OWORD PTR L_avx1_sha256_shuf_00BA + vmovdqa xmm12, OWORD PTR L_avx1_sha256_shuf_DC00 + mov r8d, DWORD PTR [rdi] + mov r9d, DWORD PTR [rdi+4] + mov r10d, DWORD PTR [rdi+8] + mov r11d, DWORD PTR [rdi+12] + mov r12d, DWORD PTR [rdi+16] + mov r13d, DWORD PTR [rdi+20] + mov r14d, DWORD PTR [rdi+24] + mov r15d, DWORD PTR [rdi+28] + ; Start of loop processing a block +L_sha256_len_avx1_start: + ; X0, X1, X2, X3 = W[0..15] + vmovdqu xmm0, OWORD PTR [rsi] + vmovdqu xmm1, OWORD PTR [rsi+16] + vpshufb xmm0, xmm0, xmm13 + vpshufb xmm1, xmm1, xmm13 + vmovdqu xmm2, OWORD PTR [rsi+32] + vmovdqu xmm3, OWORD PTR [rsi+48] + vpshufb xmm2, xmm2, xmm13 + vpshufb xmm3, xmm3, xmm13 + mov ebx, r9d + mov edx, r12d + xor ebx, r10d + ; 
set_w_k_xfer_4: 0 + vpaddd xmm4, xmm0, OWORD PTR [rbp] + vpaddd xmm5, xmm1, OWORD PTR [rbp+16] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+32] + vpaddd xmm7, xmm3, OWORD PTR [rbp+48] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm1, xmm0, 4 + vpalignr xmm4, xmm3, xmm2, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+4] + xor ecx, r13d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm3, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm0 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+8] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror 
edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+12] + xor ecx, r11d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd xmm0, xmm9, xmm4 + ; msg_sched done: 0-3 + ; msg_sched: 4-7 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm2, xmm1, 4 + vpalignr xmm4, xmm0, xmm3, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+16] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+20] + xor ecx, r9d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r15d + 
and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm0, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm1 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+24] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+28] + xor ecx, r15d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd xmm1, xmm9, xmm4 + ; msg_sched done: 4-7 + ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm3, xmm2, 4 + vpalignr xmm4, xmm1, xmm0, 4 + ; rnd_0: 1 - 2 + mov eax, 
r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+32] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+36] + xor ecx, r13d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm1, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm2 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+40] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov 
ecx, r10d + add r12d, DWORD PTR [rsp+44] + xor ecx, r11d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd xmm2, xmm9, xmm4 + ; msg_sched done: 8-11 + ; msg_sched: 12-15 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm0, xmm3, 4 + vpalignr xmm4, xmm2, xmm1, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+48] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+52] + xor ecx, r9d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm2, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; 
rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm3 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+56] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+60] + xor ecx, r15d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd xmm3, xmm9, xmm4 + ; msg_sched done: 12-15 + ; set_w_k_xfer_4: 4 + vpaddd xmm4, xmm0, OWORD PTR [rbp+64] + vpaddd xmm5, xmm1, OWORD PTR [rbp+80] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+96] + vpaddd xmm7, xmm3, OWORD PTR [rbp+112] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm1, xmm0, 4 + vpalignr xmm4, xmm3, xmm2, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld xmm6, xmm5, 7 + 
vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+4] + xor ecx, r13d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm3, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm0 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+8] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+12] + xor ecx, r11d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor 
edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd xmm0, xmm9, xmm4 + ; msg_sched done: 0-3 + ; msg_sched: 4-7 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm2, xmm1, 4 + vpalignr xmm4, xmm0, xmm3, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+16] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+20] + xor ecx, r9d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm0, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm1 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add 
r9d, DWORD PTR [rsp+24] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+28] + xor ecx, r15d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd xmm1, xmm9, xmm4 + ; msg_sched done: 4-7 + ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm3, xmm2, 4 + vpalignr xmm4, xmm1, xmm0, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+32] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, 
ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+36] + xor ecx, r13d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm1, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm2 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+40] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+44] + xor ecx, r11d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + 
add r12d, ecx + vpaddd xmm2, xmm9, xmm4 + ; msg_sched done: 8-11 + ; msg_sched: 12-15 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm0, xmm3, 4 + vpalignr xmm4, xmm2, xmm1, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+48] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+52] + xor ecx, r9d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm2, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm3 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+56] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, 
r10d + add r9d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+60] + xor ecx, r15d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd xmm3, xmm9, xmm4 + ; msg_sched done: 12-15 + ; set_w_k_xfer_4: 8 + vpaddd xmm4, xmm0, OWORD PTR [rbp+128] + vpaddd xmm5, xmm1, OWORD PTR [rbp+144] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+160] + vpaddd xmm7, xmm3, OWORD PTR [rbp+176] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm1, xmm0, 4 + vpalignr xmm4, xmm3, xmm2, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+4] + xor ecx, r13d + vpsrld 
xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm3, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm0 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+8] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+12] + xor ecx, r11d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd xmm0, xmm9, xmm4 + ; msg_sched done: 0-3 + ; msg_sched: 4-7 + ; rnd_0: 0 - 0 + ror edx, 14 + 
vpalignr xmm5, xmm2, xmm1, 4 + vpalignr xmm4, xmm0, xmm3, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+16] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+20] + xor ecx, r9d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm0, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm1 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+24] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 
0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+28] + xor ecx, r15d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd xmm1, xmm9, xmm4 + ; msg_sched done: 4-7 + ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm3, xmm2, 4 + vpalignr xmm4, xmm1, xmm0, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+32] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+36] + xor ecx, r13d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm1, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, 
r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm2 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+40] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+44] + xor ecx, r11d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd xmm2, xmm9, xmm4 + ; msg_sched done: 8-11 + ; msg_sched: 12-15 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr xmm5, xmm0, xmm3, 4 + vpalignr xmm4, xmm2, xmm1, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+48] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld xmm8, xmm5, 18 + vpslld xmm9, xmm5, 14 + ; rnd_0: 5 - 6 + and ebx, 
eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor xmm6, xmm7, xmm6 + vpor xmm8, xmm9, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+52] + xor ecx, r9d + vpsrld xmm9, xmm5, 3 + vpxor xmm6, xmm8, xmm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor xmm5, xmm9, xmm6 + vpshufd xmm6, xmm2, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld xmm8, xmm6, 10 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm3 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+56] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor xmm6, xmm7, xmm6 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+60] + xor ecx, r15d + vpsrlq xmm8, xmm6, 17 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld xmm9, xmm6, 10 + vpxor xmm8, xmm7, xmm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + 
xor ecx, r9d + xor eax, r10d + vpxor xmm9, xmm8, xmm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd xmm3, xmm9, xmm4 + ; msg_sched done: 12-15 + ; set_w_k_xfer_4: 12 + vpaddd xmm4, xmm0, OWORD PTR [rbp+192] + vpaddd xmm5, xmm1, OWORD PTR [rbp+208] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+224] + vpaddd xmm7, xmm3, OWORD PTR [rbp+240] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + ; rnd_all_4: 0-3 + add r15d, DWORD PTR [rsp] + mov ecx, r13d + mov eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+4] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+8] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+12] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror edx, 5 + 
add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 1-4 + add r11d, DWORD PTR [rsp+16] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+20] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+24] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+28] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + ; rnd_all_4: 2-5 + add r15d, DWORD PTR [rsp+32] + mov ecx, r13d + mov eax, r9d + xor 
ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+36] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+40] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+44] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 3-6 + add r11d, DWORD PTR [rsp+48] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add 
r11d, ecx + add r10d, DWORD PTR [rsp+52] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+56] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+60] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + add r8d, DWORD PTR [rdi] + add r9d, DWORD PTR [rdi+4] + add r10d, DWORD PTR [rdi+8] + add r11d, DWORD PTR [rdi+12] + add r12d, DWORD PTR [rdi+16] + add r13d, DWORD PTR [rdi+20] + add r14d, DWORD PTR [rdi+24] + add r15d, DWORD PTR [rdi+28] + add rsi, 64 + sub DWORD PTR [rsp+64], 64 + mov DWORD PTR [rdi], r8d + mov DWORD PTR [rdi+4], r9d + mov DWORD PTR [rdi+8], r10d + mov DWORD PTR [rdi+12], r11d + mov DWORD PTR [rdi+16], r12d + mov DWORD PTR [rdi+20], r13d + mov DWORD PTR [rdi+24], r14d + mov DWORD PTR [rdi+28], r15d + jnz L_sha256_len_avx1_start + xor rax, rax + vmovdqu xmm6, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm8, OWORD PTR [rsp+96] + vmovdqu xmm9, OWORD PTR [rsp+112] + vmovdqu xmm10, OWORD PTR [rsp+128] + vmovdqu xmm11, 
OWORD PTR [rsp+144] + vmovdqu xmm12, OWORD PTR [rsp+160] + vmovdqu xmm13, OWORD PTR [rsp+176] + ; NOTE(review): the loop counter decremented at [rsp+64] and the xmm6 slot restored from [rsp+64] share an address in this epilogue; confirm against the prologue that the saved xmm6 is not overwritten. + add rsp, 196 + pop rbp + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha256_AVX1_Len ENDP +_TEXT ENDS +; SHA-256 round constants K[0..63], stored as 64 DWORDs in round order (values match FIPS 180-4). +; The table starts on a 16-byte boundary, so each row of four constants is one aligned 16-byte unit. +; Transform_Sha256_AVX1_RORX adds these rows to the message words with vpaddd, addressing them through rbp. +_DATA SEGMENT +ALIGN 16 +L_avx1_rorx_sha256_k DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h + DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h + DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h + DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h + DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch + DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah + DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h + DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h + DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h + DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h + DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h + DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h + DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h + DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h + DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h + DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h +; Address of the constant table above; Transform_Sha256_AVX1_RORX loads this QWORD into rbp. +ptr_L_avx1_rorx_sha256_k QWORD L_avx1_rorx_sha256_k +_DATA ENDS +; vpshufb control mask: copies source dwords 0 and 2 into the low 8 bytes and zeroes the high 8 bytes (0ffh index bytes select zero). +; Transform_Sha256_AVX1_RORX loads it into xmm11 with vmovdqa, which is why it is 16-byte aligned. +_DATA SEGMENT +ALIGN 16 +L_avx1_rorx_sha256_shuf_00BA QWORD 0b0a090803020100h, 0ffffffffffffffffh +ptr_L_avx1_rorx_sha256_shuf_00BA QWORD L_avx1_rorx_sha256_shuf_00BA +_DATA ENDS +; vpshufb control mask: zeroes the low 8 bytes and copies source dwords 0 and 2 into the high 8 bytes. +; Transform_Sha256_AVX1_RORX loads it into xmm12 with vmovdqa. +_DATA SEGMENT +ALIGN 16 +L_avx1_rorx_sha256_shuf_DC00 QWORD 0ffffffffffffffffh, 0b0a090803020100h +ptr_L_avx1_rorx_sha256_shuf_DC00 QWORD L_avx1_rorx_sha256_shuf_DC00 +_DATA ENDS +; vpshufb control mask that reverses the byte order inside each 32-bit dword. +; Transform_Sha256_AVX1_RORX loads it into xmm13 and applies it to the 64 input bytes read from [rsi]. +_DATA SEGMENT +ALIGN 16 +L_avx1_rorx_sha256_flip_mask QWORD 0405060700010203h, 0c0d0e0f08090a0bh +ptr_L_avx1_rorx_sha256_flip_mask QWORD L_avx1_rorx_sha256_flip_mask +_DATA ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha256_AVX1_RORX PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rbp + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + sub rsp, 192 + vmovdqu OWORD PTR [rsp+64], xmm6 + vmovdqu OWORD PTR [rsp+80], xmm7 +
vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + vmovdqu OWORD PTR [rsp+128], xmm10 + vmovdqu OWORD PTR [rsp+144], xmm11 + vmovdqu OWORD PTR [rsp+160], xmm12 + vmovdqu OWORD PTR [rsp+176], xmm13 + mov rbp, QWORD PTR [ptr_L_avx1_rorx_sha256_k] + vmovdqa xmm13, OWORD PTR L_avx1_rorx_sha256_flip_mask + vmovdqa xmm11, OWORD PTR L_avx1_rorx_sha256_shuf_00BA + vmovdqa xmm12, OWORD PTR L_avx1_rorx_sha256_shuf_DC00 + ; X0, X1, X2, X3 = W[0..15] + vmovdqu xmm0, OWORD PTR [rsi] + vmovdqu xmm1, OWORD PTR [rsi+16] + vpshufb xmm0, xmm0, xmm13 + vpshufb xmm1, xmm1, xmm13 + vmovdqu xmm2, OWORD PTR [rsi+32] + vmovdqu xmm3, OWORD PTR [rsi+48] + vpshufb xmm2, xmm2, xmm13 + vpshufb xmm3, xmm3, xmm13 + mov r8d, DWORD PTR [rdi] + mov r9d, DWORD PTR [rdi+4] + mov r10d, DWORD PTR [rdi+8] + mov r11d, DWORD PTR [rdi+12] + mov r12d, DWORD PTR [rdi+16] + mov r13d, DWORD PTR [rdi+20] + mov r14d, DWORD PTR [rdi+24] + mov r15d, DWORD PTR [rdi+28] + ; set_w_k_xfer_4: 0 + vpaddd xmm4, xmm0, OWORD PTR [rbp] + vpaddd xmm5, xmm1, OWORD PTR [rbp+16] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+32] + vpaddd xmm7, xmm3, OWORD PTR [rbp+48] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + mov ebx, r9d + rorx edx, r12d, 6 + xor ebx, r10d + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp] + vpalignr xmm4, xmm3, xmm2, 4 + vpalignr xmm5, xmm1, xmm0, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + add r15d, edx + and ebx, eax + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + ; rnd_1: 0 - 0 + mov 
ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+4] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpshufd xmm6, xmm3, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r8d + add r10d, r14d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+8] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm0 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+12] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r14d + add 
r8d, r12d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vpaddd xmm0, xmm9, xmm4 + ; msg_sched done: 0-3 + ; msg_sched: 4-7 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+16] + vpalignr xmm4, xmm0, xmm3, 4 + vpalignr xmm5, xmm2, xmm1, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + add r11d, edx + and ebx, eax + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+20] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpshufd xmm6, xmm0, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r12d + add r14d, r10d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+24] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm1 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx 
edx, r10d, 2 + xor eax, r8d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+28] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r10d + add r12d, r8d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vpaddd xmm1, xmm9, xmm4 + ; msg_sched done: 4-7 + ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+32] + vpalignr xmm4, xmm1, xmm0, 4 + vpalignr xmm5, xmm3, xmm2, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + add r15d, edx + and ebx, eax + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+36] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, 
r11d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpshufd xmm6, xmm1, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r8d + add r10d, r14d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+40] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm2 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+44] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r14d + add r8d, r12d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 
+ xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vpaddd xmm2, xmm9, xmm4 + ; msg_sched done: 8-11 + ; msg_sched: 12-15 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+48] + vpalignr xmm4, xmm2, xmm1, 4 + vpalignr xmm5, xmm0, xmm3, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + add r11d, edx + and ebx, eax + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+52] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpshufd xmm6, xmm2, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r12d + add r14d, r10d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+56] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm3 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpxor xmm8, xmm8, 
xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+60] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r10d + add r12d, r8d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vpaddd xmm3, xmm9, xmm4 + ; msg_sched done: 12-15 + ; set_w_k_xfer_4: 4 + vpaddd xmm4, xmm0, OWORD PTR [rbp+64] + vpaddd xmm5, xmm1, OWORD PTR [rbp+80] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+96] + vpaddd xmm7, xmm3, OWORD PTR [rbp+112] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp] + vpalignr xmm4, xmm3, xmm2, 4 + vpalignr xmm5, xmm1, xmm0, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + add r15d, edx + and ebx, eax + xor ebx, r9d + rorx edx, r11d, 6 + add 
r15d, ebx + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+4] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpshufd xmm6, xmm3, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r8d + add r10d, r14d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+8] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm0 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+12] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + 
xor edx, ecx + mov ebx, r14d + add r8d, r12d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vpaddd xmm0, xmm9, xmm4 + ; msg_sched done: 0-3 + ; msg_sched: 4-7 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+16] + vpalignr xmm4, xmm0, xmm3, 4 + vpalignr xmm5, xmm2, xmm1, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + add r11d, edx + and ebx, eax + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+20] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpshufd xmm6, xmm0, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r12d + add r14d, r10d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+24] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm1 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd xmm4, xmm4, xmm5 + ; 
rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+28] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r10d + add r12d, r8d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vpaddd xmm1, xmm9, xmm4 + ; msg_sched done: 4-7 + ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+32] + vpalignr xmm4, xmm1, xmm0, 4 + vpalignr xmm5, xmm3, xmm2, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + add r15d, edx + and ebx, eax + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+36] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor 
ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpshufd xmm6, xmm1, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r8d + add r10d, r14d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+40] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm2 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+44] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r14d + add r8d, r12d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + 
vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vpaddd xmm2, xmm9, xmm4 + ; msg_sched done: 8-11 + ; msg_sched: 12-15 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+48] + vpalignr xmm4, xmm2, xmm1, 4 + vpalignr xmm5, xmm0, xmm3, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + add r11d, edx + and ebx, eax + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+52] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpshufd xmm6, xmm2, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r12d + add r14d, r10d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+56] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm3 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, 
r10d, 22 + add r9d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+60] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r10d + add r12d, r8d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vpaddd xmm3, xmm9, xmm4 + ; msg_sched done: 12-15 + ; set_w_k_xfer_4: 8 + vpaddd xmm4, xmm0, OWORD PTR [rbp+128] + vpaddd xmm5, xmm1, OWORD PTR [rbp+144] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+160] + vpaddd xmm7, xmm3, OWORD PTR [rbp+176] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp] + vpalignr xmm4, xmm3, xmm2, 4 + vpalignr xmm5, xmm1, xmm0, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + add r15d, edx + and ebx, 
eax + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+4] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpshufd xmm6, xmm3, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r8d + add r10d, r14d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+8] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm0 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+12] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx 
+ vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r14d + add r8d, r12d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vpaddd xmm0, xmm9, xmm4 + ; msg_sched done: 0-3 + ; msg_sched: 4-7 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+16] + vpalignr xmm4, xmm0, xmm3, 4 + vpalignr xmm5, xmm2, xmm1, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + add r11d, edx + and ebx, eax + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+20] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpshufd xmm6, xmm0, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r12d + add r14d, r10d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+24] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm1 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx 
ecx, r10d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+28] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r10d + add r12d, r8d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vpaddd xmm1, xmm9, xmm4 + ; msg_sched done: 4-7 + ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+32] + vpalignr xmm4, xmm1, xmm0, 4 + vpalignr xmm5, xmm3, xmm2, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + add r15d, edx + and ebx, eax + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+36] + 
vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpshufd xmm6, xmm1, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r8d + add r10d, r14d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+40] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm2 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+44] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r14d + add r8d, r12d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor 
ebx, r13d + add r12d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vpaddd xmm2, xmm9, xmm4 + ; msg_sched done: 8-11 + ; msg_sched: 12-15 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+48] + vpalignr xmm4, xmm2, xmm1, 4 + vpalignr xmm5, xmm0, xmm3, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + add r11d, edx + and ebx, eax + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+52] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpshufd xmm6, xmm2, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r12d + add r14d, r10d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+56] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm3 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpxor xmm6, xmm6, xmm7 + ; 
rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+60] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r10d + add r12d, r8d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vpaddd xmm3, xmm9, xmm4 + ; msg_sched done: 12-15 + ; set_w_k_xfer_4: 12 + vpaddd xmm4, xmm0, OWORD PTR [rbp+192] + vpaddd xmm5, xmm1, OWORD PTR [rbp+208] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+224] + vpaddd xmm7, xmm3, OWORD PTR [rbp+240] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + xor eax, eax + ; rnd_all_4: 0-3 + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + add r8d, eax + add r15d, DWORD PTR [rsp] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+4] + mov ebx, r12d + xor ecx, 
edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + add r10d, r14d + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + add r14d, eax + add r13d, DWORD PTR [rsp+8] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+12] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + add r8d, r12d + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + ; rnd_all_4: 1-4 + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + add r12d, eax + add r11d, DWORD PTR [rsp+16] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+20] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + add r14d, r10d + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, 
r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + add r10d, eax + add r9d, DWORD PTR [rsp+24] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+28] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + add r12d, r8d + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + ; rnd_all_4: 2-5 + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + add r8d, eax + add r15d, DWORD PTR [rsp+32] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+36] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + add r10d, r14d + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + add r14d, eax + add r13d, DWORD PTR [rsp+40] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, 
eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+44] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + add r8d, r12d + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + ; rnd_all_4: 3-6 + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + add r12d, eax + add r11d, DWORD PTR [rsp+48] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+52] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + add r14d, r10d + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + add r10d, eax + add r9d, DWORD PTR [rsp+56] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+60] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add 
r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + add r12d, r8d + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + add r8d, eax + add DWORD PTR [rdi], r8d + add DWORD PTR [rdi+4], r9d + add DWORD PTR [rdi+8], r10d + add DWORD PTR [rdi+12], r11d + add DWORD PTR [rdi+16], r12d + add DWORD PTR [rdi+20], r13d + add DWORD PTR [rdi+24], r14d + add DWORD PTR [rdi+28], r15d + xor rax, rax + vmovdqu xmm6, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm8, OWORD PTR [rsp+96] + vmovdqu xmm9, OWORD PTR [rsp+112] + vmovdqu xmm10, OWORD PTR [rsp+128] + vmovdqu xmm11, OWORD PTR [rsp+144] + vmovdqu xmm12, OWORD PTR [rsp+160] + vmovdqu xmm13, OWORD PTR [rsp+176] + add rsp, 192 + pop rsi + pop rdi + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha256_AVX1_RORX ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha256_AVX1_RORX_Len PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbp + mov rdi, rcx + mov rsi, rdx + mov rbp, r8 + sub rsp, 196 + vmovdqu OWORD PTR [rsp+64], xmm6 + vmovdqu OWORD PTR [rsp+80], xmm7 + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + vmovdqu OWORD PTR [rsp+128], xmm10 + vmovdqu OWORD PTR [rsp+144], xmm11 + vmovdqu OWORD PTR [rsp+160], xmm12 + vmovdqu OWORD PTR [rsp+176], xmm13 + mov DWORD PTR [rsp+64], ebp + mov rbp, QWORD PTR [ptr_L_avx1_rorx_sha256_k] + vmovdqa xmm13, OWORD PTR L_avx1_rorx_sha256_flip_mask + vmovdqa xmm11, OWORD PTR L_avx1_rorx_sha256_shuf_00BA + vmovdqa xmm12, OWORD PTR L_avx1_rorx_sha256_shuf_DC00 + mov r8d, DWORD PTR [rdi] + mov r9d, DWORD PTR [rdi+4] + mov r10d, DWORD PTR [rdi+8] + mov r11d, DWORD PTR [rdi+12] + mov r12d, DWORD PTR [rdi+16] + mov r13d, DWORD PTR [rdi+20] + mov r14d, DWORD PTR [rdi+24] + mov r15d, DWORD PTR [rdi+28] + ; Start of loop processing a block +L_sha256_len_avx1_len_rorx_start: + ; X0, 
X1, X2, X3 = W[0..15] + vmovdqu xmm0, OWORD PTR [rsi] + vmovdqu xmm1, OWORD PTR [rsi+16] + vpshufb xmm0, xmm0, xmm13 + vpshufb xmm1, xmm1, xmm13 + vmovdqu xmm2, OWORD PTR [rsi+32] + vmovdqu xmm3, OWORD PTR [rsi+48] + vpshufb xmm2, xmm2, xmm13 + vpshufb xmm3, xmm3, xmm13 + ; set_w_k_xfer_4: 0 + vpaddd xmm4, xmm0, OWORD PTR [rbp] + vpaddd xmm5, xmm1, OWORD PTR [rbp+16] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+32] + vpaddd xmm7, xmm3, OWORD PTR [rbp+48] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + mov ebx, r9d + rorx edx, r12d, 6 + xor ebx, r10d + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp] + vpalignr xmm4, xmm3, xmm2, 4 + vpalignr xmm5, xmm1, xmm0, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + add r15d, edx + and ebx, eax + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+4] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpshufd xmm6, xmm3, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r8d + add r10d, r14d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, 
r10d, 6 + add r14d, eax + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+8] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm0 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+12] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r14d + add r8d, r12d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vpaddd xmm0, xmm9, xmm4 + ; msg_sched done: 0-3 + ; msg_sched: 4-7 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+16] + vpalignr xmm4, xmm0, xmm3, 4 + vpalignr xmm5, xmm2, xmm1, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + 
add r11d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + add r11d, edx + and ebx, eax + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+20] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpshufd xmm6, xmm0, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r12d + add r14d, r10d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+24] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm1 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+28] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpsrlq xmm7, xmm6, 
19 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r10d + add r12d, r8d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vpaddd xmm1, xmm9, xmm4 + ; msg_sched done: 4-7 + ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+32] + vpalignr xmm4, xmm1, xmm0, 4 + vpalignr xmm5, xmm3, xmm2, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + add r15d, edx + and ebx, eax + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+36] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpshufd xmm6, xmm1, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r8d + add r10d, r14d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+40] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 
+ xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm2 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+44] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r14d + add r8d, r12d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vpaddd xmm2, xmm9, xmm4 + ; msg_sched done: 8-11 + ; msg_sched: 12-15 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+48] + vpalignr xmm4, xmm2, xmm1, 4 + vpalignr xmm5, xmm0, xmm3, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + 
add r11d, edx + and ebx, eax + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+52] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpshufd xmm6, xmm2, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r12d + add r14d, r10d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+56] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm3 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+60] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 
22 + add r8d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r10d + add r12d, r8d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vpaddd xmm3, xmm9, xmm4 + ; msg_sched done: 12-15 + ; set_w_k_xfer_4: 4 + vpaddd xmm4, xmm0, OWORD PTR [rbp+64] + vpaddd xmm5, xmm1, OWORD PTR [rbp+80] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+96] + vpaddd xmm7, xmm3, OWORD PTR [rbp+112] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp] + vpalignr xmm4, xmm3, xmm2, 4 + vpalignr xmm5, xmm1, xmm0, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + add r15d, edx + and ebx, eax + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+4] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpshufd xmm6, xmm3, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r8d + add r10d, r14d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + 
xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+8] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm0 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+12] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r14d + add r8d, r12d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vpaddd xmm0, xmm9, xmm4 + ; msg_sched done: 0-3 + ; msg_sched: 4-7 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+16] + vpalignr xmm4, xmm0, xmm3, 4 + vpalignr xmm5, xmm2, xmm1, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + xor ecx, 
edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + add r11d, edx + and ebx, eax + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+20] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpshufd xmm6, xmm0, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r12d + add r14d, r10d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+24] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm1 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+28] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, 
r9d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r10d + add r12d, r8d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vpaddd xmm1, xmm9, xmm4 + ; msg_sched done: 4-7 + ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+32] + vpalignr xmm4, xmm1, xmm0, 4 + vpalignr xmm5, xmm3, xmm2, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + add r15d, edx + and ebx, eax + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+36] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpshufd xmm6, xmm1, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r8d + add r10d, r14d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+40] + vpsrlq 
xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm2 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+44] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r14d + add r8d, r12d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vpaddd xmm2, xmm9, xmm4 + ; msg_sched done: 8-11 + ; msg_sched: 12-15 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+48] + vpalignr xmm4, xmm2, xmm1, 4 + vpalignr xmm5, xmm0, xmm3, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r13d + 
add r15d, r11d + xor eax, r12d + add r11d, edx + and ebx, eax + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+52] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpshufd xmm6, xmm2, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r12d + add r14d, r10d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+56] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm3 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+60] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 
+ xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r10d + add r12d, r8d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vpaddd xmm3, xmm9, xmm4 + ; msg_sched done: 12-15 + ; set_w_k_xfer_4: 8 + vpaddd xmm4, xmm0, OWORD PTR [rbp+128] + vpaddd xmm5, xmm1, OWORD PTR [rbp+144] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+160] + vpaddd xmm7, xmm3, OWORD PTR [rbp+176] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp] + vpalignr xmm4, xmm3, xmm2, 4 + vpalignr xmm5, xmm1, xmm0, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + add r15d, edx + and ebx, eax + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+4] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpshufd xmm6, xmm3, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r8d + add r10d, r14d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrld 
xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+8] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm0 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+12] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r14d + add r8d, r12d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vpaddd xmm0, xmm9, xmm4 + ; msg_sched done: 0-3 + ; msg_sched: 4-7 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+16] + vpalignr xmm4, xmm0, xmm3, 4 + vpalignr xmm5, xmm2, xmm1, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r11d, edx + rorx edx, 
r12d, 2 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + add r11d, edx + and ebx, eax + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+20] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpshufd xmm6, xmm0, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r12d + add r14d, r10d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+24] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm1 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+28] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and 
ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r10d + add r12d, r8d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vpaddd xmm1, xmm9, xmm4 + ; msg_sched done: 4-7 + ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+32] + vpalignr xmm4, xmm1, xmm0, 4 + vpalignr xmm5, xmm3, xmm2, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + add r15d, edx + and ebx, eax + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+36] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpshufd xmm6, xmm1, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r8d + add r10d, r14d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add 
r13d, DWORD PTR [rsp+40] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm2 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+44] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpsrlq xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r14d + add r8d, r12d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vpaddd xmm2, xmm9, xmm4 + ; msg_sched done: 8-11 + ; msg_sched: 12-15 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+48] + vpalignr xmm4, xmm2, xmm1, 4 + vpalignr xmm5, xmm0, xmm3, 4 + ; rnd_0: 1 - 2 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld xmm6, xmm5, 7 + vpslld xmm7, xmm5, 25 + ; rnd_0: 3 - 4 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld xmm8, xmm5, 3 + vpor xmm7, xmm7, xmm6 + ; rnd_0: 5 - 7 + 
xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + add r11d, edx + and ebx, eax + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+52] + vpsrld xmm6, xmm5, 18 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpslld xmm5, xmm5, 14 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpxor xmm7, xmm7, xmm5 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor xmm7, xmm7, xmm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpshufd xmm6, xmm2, 250 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r12d + add r14d, r10d + vpxor xmm5, xmm7, xmm8 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrld xmm8, xmm6, 10 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+56] + vpsrlq xmm7, xmm6, 19 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpsrlq xmm6, xmm6, 17 + vpaddd xmm4, xmm4, xmm3 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd xmm4, xmm4, xmm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpxor xmm6, xmm6, xmm7 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpxor xmm8, xmm8, xmm6 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufb xmm8, xmm8, xmm11 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpaddd xmm4, xmm4, xmm8 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+60] + vpshufd xmm6, xmm4, 80 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpsrld xmm9, xmm6, 10 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpsrlq xmm7, xmm6, 19 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpsrlq 
xmm6, xmm6, 17 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpxor xmm6, xmm6, xmm7 + ; rnd_1: 5 - 5 + xor edx, ecx + mov ebx, r10d + add r12d, r8d + vpxor xmm9, xmm9, xmm6 + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + vpshufb xmm9, xmm9, xmm12 + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vpaddd xmm3, xmm9, xmm4 + ; msg_sched done: 12-15 + ; set_w_k_xfer_4: 12 + vpaddd xmm4, xmm0, OWORD PTR [rbp+192] + vpaddd xmm5, xmm1, OWORD PTR [rbp+208] + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm5 + vpaddd xmm6, xmm2, OWORD PTR [rbp+224] + vpaddd xmm7, xmm3, OWORD PTR [rbp+240] + vmovdqu OWORD PTR [rsp+32], xmm6 + vmovdqu OWORD PTR [rsp+48], xmm7 + xor eax, eax + xor ecx, ecx + ; rnd_all_4: 0-3 + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + add r8d, eax + add r15d, DWORD PTR [rsp] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+4] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + add r10d, r14d + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + add r14d, eax + add r13d, DWORD PTR [rsp+8] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, 
eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+12] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + add r8d, r12d + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + ; rnd_all_4: 1-4 + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + add r12d, eax + add r11d, DWORD PTR [rsp+16] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+20] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + add r14d, r10d + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + add r10d, eax + add r9d, DWORD PTR [rsp+24] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+28] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + 
rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + add r12d, r8d + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + ; rnd_all_4: 2-5 + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + add r8d, eax + add r15d, DWORD PTR [rsp+32] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+36] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + add r10d, r14d + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + add r14d, eax + add r13d, DWORD PTR [rsp+40] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+44] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + add r8d, r12d + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + ; rnd_all_4: 3-6 + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + add r12d, eax + add r11d, DWORD PTR [rsp+48] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, 
r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+52] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + add r14d, r10d + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + add r10d, eax + add r9d, DWORD PTR [rsp+56] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+60] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + add r12d, r8d + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + add r8d, eax + add r8d, DWORD PTR [rdi] + add r9d, DWORD PTR [rdi+4] + add r10d, DWORD PTR [rdi+8] + add r11d, DWORD PTR [rdi+12] + add r12d, DWORD PTR [rdi+16] + add r13d, DWORD PTR [rdi+20] + add r14d, DWORD PTR [rdi+24] + add r15d, DWORD PTR [rdi+28] + add rsi, 64 + sub DWORD PTR [rsp+64], 64 + mov DWORD PTR [rdi], r8d + mov DWORD PTR [rdi+4], r9d + mov DWORD PTR [rdi+8], r10d + mov DWORD PTR [rdi+12], r11d + mov DWORD PTR [rdi+16], r12d + mov DWORD PTR [rdi+20], 
r13d + mov DWORD PTR [rdi+24], r14d + mov DWORD PTR [rdi+28], r15d + jnz L_sha256_len_avx1_len_rorx_start + xor rax, rax + vmovdqu xmm6, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm8, OWORD PTR [rsp+96] + vmovdqu xmm9, OWORD PTR [rsp+112] + vmovdqu xmm10, OWORD PTR [rsp+128] + vmovdqu xmm11, OWORD PTR [rsp+144] + vmovdqu xmm12, OWORD PTR [rsp+160] + vmovdqu xmm13, OWORD PTR [rsp+176] + add rsp, 196 + pop rbp + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha256_AVX1_RORX_Len ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_sha256_sha_k DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h + DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h + DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h + DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h + DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch + DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah + DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h + DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h + DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h + DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h + DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h + DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h + DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h + DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h + DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h + DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h +ptr_L_avx1_sha256_sha_k QWORD L_avx1_sha256_sha_k +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_sha256_shuf_mask QWORD 0405060700010203h, 0c0d0e0f08090a0bh +ptr_L_avx1_sha256_shuf_mask QWORD L_avx1_sha256_shuf_mask +_DATA ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha256_AVX1_Sha PROC + sub rsp, 80 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + mov rax, QWORD PTR [ptr_L_avx1_sha256_sha_k] + vmovdqa xmm10, OWORD PTR 
L_avx1_sha256_shuf_mask + vmovq xmm1, QWORD PTR [rcx] + vmovq xmm2, QWORD PTR [rcx+8] + vmovhpd xmm1, xmm1, QWORD PTR [rcx+16] + vmovhpd xmm2, xmm2, QWORD PTR [rcx+24] + vpshufd xmm1, xmm1, 27 + vpshufd xmm2, xmm2, 27 + vmovdqu xmm3, OWORD PTR [rdx] + vmovdqu xmm4, OWORD PTR [rdx+16] + vmovdqu xmm5, OWORD PTR [rdx+32] + vmovdqu xmm6, OWORD PTR [rdx+48] + vpshufb xmm3, xmm3, xmm10 + vmovdqa xmm8, xmm1 + vmovdqa xmm9, xmm2 + ; Rounds: 0-3 + vpaddd xmm0, xmm3, OWORD PTR [rax] + sha256rnds2 xmm2, xmm1, xmm0 + vpshufd xmm0, xmm0, 14 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 4-7 + vpshufb xmm4, xmm4, xmm10 + vpaddd xmm0, xmm4, OWORD PTR [rax+16] + sha256rnds2 xmm2, xmm1, xmm0 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm3, xmm4 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 8-11 + vpshufb xmm5, xmm5, xmm10 + vpaddd xmm0, xmm5, OWORD PTR [rax+32] + sha256rnds2 xmm2, xmm1, xmm0 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm4, xmm5 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 12-15 + vpshufb xmm6, xmm6, xmm10 + vpaddd xmm0, xmm6, OWORD PTR [rax+48] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm6, xmm5, 4 + vpaddd xmm3, xmm3, xmm7 + sha256msg2 xmm3, xmm6 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm5, xmm6 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 16-19 + vpaddd xmm0, xmm3, OWORD PTR [rax+64] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm3, xmm6, 4 + vpaddd xmm4, xmm4, xmm7 + sha256msg2 xmm4, xmm3 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm6, xmm3 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 20-23 + vpaddd xmm0, xmm4, OWORD PTR [rax+80] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm4, xmm3, 4 + vpaddd xmm5, xmm5, xmm7 + sha256msg2 xmm5, xmm4 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm3, xmm4 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 24-27 + vpaddd xmm0, xmm5, OWORD PTR [rax+96] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm5, xmm4, 4 + vpaddd xmm6, xmm6, xmm7 + sha256msg2 xmm6, xmm5 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm4, xmm5 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 
28-31 + vpaddd xmm0, xmm6, OWORD PTR [rax+112] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm6, xmm5, 4 + vpaddd xmm3, xmm3, xmm7 + sha256msg2 xmm3, xmm6 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm5, xmm6 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 32-35 + vpaddd xmm0, xmm3, OWORD PTR [rax+128] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm3, xmm6, 4 + vpaddd xmm4, xmm4, xmm7 + sha256msg2 xmm4, xmm3 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm6, xmm3 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 36-39 + vpaddd xmm0, xmm4, OWORD PTR [rax+144] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm4, xmm3, 4 + vpaddd xmm5, xmm5, xmm7 + sha256msg2 xmm5, xmm4 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm3, xmm4 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 40-43 + vpaddd xmm0, xmm5, OWORD PTR [rax+160] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm5, xmm4, 4 + vpaddd xmm6, xmm6, xmm7 + sha256msg2 xmm6, xmm5 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm4, xmm5 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 44-47 + vpaddd xmm0, xmm6, OWORD PTR [rax+176] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm6, xmm5, 4 + vpaddd xmm3, xmm3, xmm7 + sha256msg2 xmm3, xmm6 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm5, xmm6 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 48-51 + vpaddd xmm0, xmm3, OWORD PTR [rax+192] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm3, xmm6, 4 + vpaddd xmm4, xmm4, xmm7 + sha256msg2 xmm4, xmm3 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm6, xmm3 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 52-63 + vpaddd xmm0, xmm4, OWORD PTR [rax+208] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm4, xmm3, 4 + vpaddd xmm5, xmm5, xmm7 + sha256msg2 xmm5, xmm4 + vpshufd xmm0, xmm0, 14 + sha256rnds2 xmm1, xmm2, xmm0 + vpaddd xmm0, xmm5, OWORD PTR [rax+224] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm5, xmm4, 4 + vpaddd xmm6, xmm6, xmm7 + sha256msg2 xmm6, xmm5 + vpshufd xmm0, xmm0, 14 + sha256rnds2 xmm1, xmm2, xmm0 + vpaddd xmm0, xmm6, OWORD PTR [rax+240] + sha256rnds2 xmm2, xmm1, 
xmm0 + vpshufd xmm0, xmm0, 14 + sha256rnds2 xmm1, xmm2, xmm0 + vpaddd xmm1, xmm1, xmm8 + vpaddd xmm2, xmm2, xmm9 + vpshufd xmm1, xmm1, 27 + vpshufd xmm2, xmm2, 27 + vmovq QWORD PTR [rcx], xmm1 + vmovq QWORD PTR [rcx+8], xmm2 + vmovhpd QWORD PTR [rcx+16], xmm1 + vmovhpd QWORD PTR [rcx+24], xmm2 + xor rax, rax + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + add rsp, 80 + ret +Transform_Sha256_AVX1_Sha ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha256_AVX1_Sha_Len PROC + sub rsp, 80 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + mov rax, QWORD PTR [ptr_L_avx1_sha256_sha_k] + vmovdqa xmm10, OWORD PTR L_avx1_sha256_shuf_mask + vmovq xmm1, QWORD PTR [rcx] + vmovq xmm2, QWORD PTR [rcx+8] + vmovhpd xmm1, xmm1, QWORD PTR [rcx+16] + vmovhpd xmm2, xmm2, QWORD PTR [rcx+24] + vpshufd xmm1, xmm1, 27 + vpshufd xmm2, xmm2, 27 + ; Start of loop processing a block + ; Each pass hashes the 64 bytes at [rdx] and r8d counts the bytes still to process. + ; xmm1 and xmm2 carry the running state loaded from [rcx] and rax addresses the table behind ptr_L_avx1_sha256_sha_k. +L_sha256_sha_len_avx1_start: + vmovdqu xmm3, OWORD PTR [rdx] + vmovdqu xmm4, OWORD PTR [rdx+16] + vmovdqu xmm5, OWORD PTR [rdx+32] + vmovdqu xmm6, OWORD PTR [rdx+48] + vpshufb xmm3, xmm3, xmm10 + ; Snapshot the incoming state so it can be added back after round 63. + vmovdqa xmm8, xmm1 + vmovdqa xmm9, xmm2 + ; Rounds: 0-3 + ; xmm0 is message words plus table entry. vpshufd 14 moves its upper two dwords down for the second sha256rnds2. + vpaddd xmm0, xmm3, OWORD PTR [rax] + sha256rnds2 xmm2, xmm1, xmm0 + vpshufd xmm0, xmm0, 14 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 4-7 + vpshufb xmm4, xmm4, xmm10 + vpaddd xmm0, xmm4, OWORD PTR [rax+16] + sha256rnds2 xmm2, xmm1, xmm0 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm3, xmm4 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 8-11 + vpshufb xmm5, xmm5, xmm10 + vpaddd xmm0, xmm5, OWORD PTR [rax+32] + sha256rnds2 xmm2, xmm1, xmm0 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm4, xmm5 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 12-15 + vpshufb xmm6, xmm6, xmm10 + vpaddd xmm0, xmm6, OWORD PTR [rax+48] + sha256rnds2 
xmm2, xmm1, xmm0 + vpalignr xmm7, xmm6, xmm5, 4 + vpaddd xmm3, xmm3, xmm7 + sha256msg2 xmm3, xmm6 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm5, xmm6 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 16-19 + ; Same shape through round 51: vpalignr, vpaddd and sha256msg2 complete the four schedule words used by the next group while sha256msg1 starts the words for a later group. + vpaddd xmm0, xmm3, OWORD PTR [rax+64] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm3, xmm6, 4 + vpaddd xmm4, xmm4, xmm7 + sha256msg2 xmm4, xmm3 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm6, xmm3 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 20-23 + vpaddd xmm0, xmm4, OWORD PTR [rax+80] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm4, xmm3, 4 + vpaddd xmm5, xmm5, xmm7 + sha256msg2 xmm5, xmm4 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm3, xmm4 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 24-27 + vpaddd xmm0, xmm5, OWORD PTR [rax+96] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm5, xmm4, 4 + vpaddd xmm6, xmm6, xmm7 + sha256msg2 xmm6, xmm5 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm4, xmm5 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 28-31 + vpaddd xmm0, xmm6, OWORD PTR [rax+112] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm6, xmm5, 4 + vpaddd xmm3, xmm3, xmm7 + sha256msg2 xmm3, xmm6 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm5, xmm6 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 32-35 + vpaddd xmm0, xmm3, OWORD PTR [rax+128] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm3, xmm6, 4 + vpaddd xmm4, xmm4, xmm7 + sha256msg2 xmm4, xmm3 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm6, xmm3 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 36-39 + vpaddd xmm0, xmm4, OWORD PTR [rax+144] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm4, xmm3, 4 + vpaddd xmm5, xmm5, xmm7 + sha256msg2 xmm5, xmm4 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm3, xmm4 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 40-43 + vpaddd xmm0, xmm5, OWORD PTR [rax+160] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm5, xmm4, 4 + vpaddd xmm6, xmm6, xmm7 + sha256msg2 xmm6, xmm5 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm4, xmm5 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 44-47 + vpaddd xmm0, xmm6, OWORD PTR [rax+176] + 
sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm6, xmm5, 4 + vpaddd xmm3, xmm3, xmm7 + sha256msg2 xmm3, xmm6 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm5, xmm6 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 48-51 + vpaddd xmm0, xmm3, OWORD PTR [rax+192] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm3, xmm6, 4 + vpaddd xmm4, xmm4, xmm7 + sha256msg2 xmm4, xmm3 + vpshufd xmm0, xmm0, 14 + sha256msg1 xmm6, xmm3 + sha256rnds2 xmm1, xmm2, xmm0 + ; Rounds: 52-63 + ; Tail of the schedule: no sha256msg1 is issued any more and the last four rounds need no schedule work at all. + vpaddd xmm0, xmm4, OWORD PTR [rax+208] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm4, xmm3, 4 + vpaddd xmm5, xmm5, xmm7 + sha256msg2 xmm5, xmm4 + vpshufd xmm0, xmm0, 14 + sha256rnds2 xmm1, xmm2, xmm0 + vpaddd xmm0, xmm5, OWORD PTR [rax+224] + sha256rnds2 xmm2, xmm1, xmm0 + vpalignr xmm7, xmm5, xmm4, 4 + vpaddd xmm6, xmm6, xmm7 + sha256msg2 xmm6, xmm5 + vpshufd xmm0, xmm0, 14 + sha256rnds2 xmm1, xmm2, xmm0 + vpaddd xmm0, xmm6, OWORD PTR [rax+240] + sha256rnds2 xmm2, xmm1, xmm0 + vpshufd xmm0, xmm0, 14 + sha256rnds2 xmm1, xmm2, xmm0 + ; Step to the next 64-byte block. The body runs before r8d is ever tested and the loop ends only when r8d reaches exactly zero. + add rdx, 64 + sub r8d, 64 + ; Add the snapshot back into the state. vpaddd does not touch EFLAGS so the jnz still tests the sub above. + vpaddd xmm1, xmm1, xmm8 + vpaddd xmm2, xmm2, xmm9 + jnz L_sha256_sha_len_avx1_start + ; Done: reverse the dword order of each register (vpshufd 27) and write the 32-byte state back to [rcx]. + vpshufd xmm1, xmm1, 27 + vpshufd xmm2, xmm2, 27 + vmovq QWORD PTR [rcx], xmm1 + vmovq QWORD PTR [rcx+8], xmm2 + vmovhpd QWORD PTR [rcx+16], xmm1 + vmovhpd QWORD PTR [rcx+24], xmm2 + ; Return 0, restore xmm6-xmm10 saved by the prologue (callee-saved under the Microsoft x64 calling convention) and drop the 80-byte frame. + xor rax, rax + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + add rsp, 80 + ret +Transform_Sha256_AVX1_Sha_Len ENDP +_TEXT ENDS +ENDIF +IFDEF HAVE_INTEL_AVX2 +_DATA SEGMENT +ALIGN 16 +; SHA-256 round constants K[0..63] for the AVX2 code. Every row of four is stored twice in succession, +; so one 32-byte load puts the same four constants in both 128-bit halves of a ymm register. +L_avx2_sha256_k DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h + DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h + DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h + DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h + DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h + DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h + DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h + 
DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h + DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch + DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch + DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah + DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah + DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h + DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h + DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h + DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h + DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h + DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h + DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h + DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h + DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h + DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h + DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h + DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h + DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h + DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h + DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h + DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h + DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h + DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h + DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h + DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h +ptr_L_avx2_sha256_k QWORD L_avx2_sha256_k +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_sha256_shuf_00BA QWORD 0b0a090803020100h, 0ffffffffffffffffh + QWORD 0b0a090803020100h, 0ffffffffffffffffh +ptr_L_avx2_sha256_shuf_00BA QWORD L_avx2_sha256_shuf_00BA +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_sha256_shuf_DC00 QWORD 0ffffffffffffffffh, 0b0a090803020100h + QWORD 0ffffffffffffffffh, 0b0a090803020100h +ptr_L_avx2_sha256_shuf_DC00 QWORD L_avx2_sha256_shuf_DC00 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_sha256_flip_mask QWORD 0405060700010203h, 0c0d0e0f08090a0bh + QWORD 0405060700010203h, 0c0d0e0f08090a0bh +ptr_L_avx2_sha256_flip_mask QWORD L_avx2_sha256_flip_mask +_DATA ENDS ; The three masks above are vpshufb controls, each 16-byte pattern stored twice. shuf_00BA packs source dwords 0 and 2 into the low qword and zeroes the high qword, shuf_DC00 does the mirror image, flip_mask reverses the bytes inside every dword.
+_TEXT SEGMENT READONLY PARA +Transform_Sha256_AVX2 PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rbp + push rdi + push rsi + mov rdi, rcx + mov rsi, rdx + sub rsp, 640 + vmovdqu OWORD PTR [rsp+512], xmm6 + vmovdqu OWORD PTR [rsp+528], xmm7 + vmovdqu OWORD PTR [rsp+544], xmm8 + vmovdqu OWORD PTR [rsp+560], xmm9 + vmovdqu OWORD PTR [rsp+576], xmm10 + vmovdqu OWORD PTR [rsp+592], xmm11 + vmovdqu OWORD PTR [rsp+608], xmm12 + vmovdqu OWORD PTR [rsp+624], xmm13 + mov rbp, QWORD PTR [ptr_L_avx2_sha256_k] + vmovdqa xmm13, OWORD PTR L_avx2_sha256_flip_mask + vmovdqu ymm11, YMMWORD PTR L_avx2_sha256_shuf_00BA + vmovdqu ymm12, YMMWORD PTR L_avx2_sha256_shuf_DC00 + mov r8d, DWORD PTR [rdi] + mov r9d, DWORD PTR [rdi+4] + mov r10d, DWORD PTR [rdi+8] + mov r11d, DWORD PTR [rdi+12] + mov r12d, DWORD PTR [rdi+16] + mov r13d, DWORD PTR [rdi+20] + mov r14d, DWORD PTR [rdi+24] + mov r15d, DWORD PTR [rdi+28] + ; X0, X1, X2, X3 = W[0..15] + vmovdqu xmm0, OWORD PTR [rsi] + vmovdqu xmm1, OWORD PTR [rsi+16] + vpshufb xmm0, xmm0, xmm13 + vpshufb xmm1, xmm1, xmm13 + vmovdqu xmm2, OWORD PTR [rsi+32] + vmovdqu xmm3, OWORD PTR [rsi+48] + vpshufb xmm2, xmm2, xmm13 + vpshufb xmm3, xmm3, xmm13 + mov ebx, r9d + mov edx, r12d + xor ebx, r10d + ; set_w_k_xfer_4: 0 + vpaddd ymm4, ymm0, YMMWORD PTR [rbp] + vpaddd ymm5, ymm1, YMMWORD PTR [rbp+32] + vmovdqu YMMWORD PTR [rsp], ymm4 + vmovdqu YMMWORD PTR [rsp+32], ymm5 + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+64] + vpaddd ymm5, ymm3, YMMWORD PTR [rbp+96] + vmovdqu YMMWORD PTR [rsp+64], ymm4 + vmovdqu YMMWORD PTR [rsp+96], ymm5 + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm1, ymm0, 4 + vpalignr ymm4, ymm3, ymm2, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, 
r8d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+4] + xor ecx, r13d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm3, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm0 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+8] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+12] + xor ecx, r11d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror 
edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd ymm0, ymm9, ymm4 + ; msg_sched done: 0-3 + ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm2, ymm1, 4 + vpalignr ymm4, ymm0, ymm3, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+32] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+36] + xor ecx, r9d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm0, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm1 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+40] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor ymm6, ymm7, ymm6 + 
vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+44] + xor ecx, r15d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd ymm1, ymm9, ymm4 + ; msg_sched done: 8-11 + ; msg_sched: 16-19 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm3, ymm2, 4 + vpalignr ymm4, ymm1, ymm0, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+64] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+68] + xor ecx, r13d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, 
ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm1, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm2 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+72] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+76] + xor ecx, r11d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd ymm2, ymm9, ymm4 + ; msg_sched done: 16-19 + ; msg_sched: 24-27 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm0, ymm3, 4 
+ vpalignr ymm4, ymm2, ymm1, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+96] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+100] + xor ecx, r9d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm2, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm3 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+104] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd 
ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+108] + xor ecx, r15d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd ymm3, ymm9, ymm4 + ; msg_sched done: 24-27 + ; set_w_k_xfer_4: 4 + vpaddd ymm4, ymm0, YMMWORD PTR [rbp+128] + vpaddd ymm5, ymm1, YMMWORD PTR [rbp+160] + vmovdqu YMMWORD PTR [rsp+128], ymm4 + vmovdqu YMMWORD PTR [rsp+160], ymm5 + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+192] + vpaddd ymm5, ymm3, YMMWORD PTR [rbp+224] + vmovdqu YMMWORD PTR [rsp+192], ymm4 + vmovdqu YMMWORD PTR [rsp+224], ymm5 + ; msg_sched: 32-35 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm1, ymm0, 4 + vpalignr ymm4, ymm3, ymm2, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+128] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+132] + xor ecx, r13d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + 
xor edx, r11d + add r14d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm3, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm0 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+136] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+140] + xor ecx, r11d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd ymm0, ymm9, ymm4 + ; msg_sched done: 32-35 + ; msg_sched: 40-43 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm2, ymm1, 4 + vpalignr ymm4, ymm0, ymm3, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add 
r11d, DWORD PTR [rsp+160] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+164] + xor ecx, r9d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm0, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm1 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+168] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR 
[rsp+172] + xor ecx, r15d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd ymm1, ymm9, ymm4 + ; msg_sched done: 40-43 + ; msg_sched: 48-51 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm3, ymm2, 4 + vpalignr ymm4, ymm1, ymm0, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+192] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+196] + xor ecx, r13d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm1, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq 
ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm2 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+200] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+204] + xor ecx, r11d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd ymm2, ymm9, ymm4 + ; msg_sched done: 48-51 + ; msg_sched: 56-59 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm0, ymm3, 4 + vpalignr ymm4, ymm2, ymm1, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+224] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add 
r11d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+228] + xor ecx, r9d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm2, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm3 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+232] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+236] + xor ecx, r15d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, 
r8d + xor ecx, r9d + add r8d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd ymm3, ymm9, ymm4 + ; msg_sched done: 56-59 + ; set_w_k_xfer_4: 8 + vpaddd ymm4, ymm0, YMMWORD PTR [rbp+256] + vpaddd ymm5, ymm1, YMMWORD PTR [rbp+288] + vmovdqu YMMWORD PTR [rsp+256], ymm4 + vmovdqu YMMWORD PTR [rsp+288], ymm5 + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+320] + vpaddd ymm5, ymm3, YMMWORD PTR [rbp+352] + vmovdqu YMMWORD PTR [rsp+320], ymm4 + vmovdqu YMMWORD PTR [rsp+352], ymm5 + ; msg_sched: 64-67 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm1, ymm0, 4 + vpalignr ymm4, ymm3, ymm2, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+256] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+260] + xor ecx, r13d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm3, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm0 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, 
r11d + add r13d, DWORD PTR [rsp+264] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+268] + xor ecx, r11d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd ymm0, ymm9, ymm4 + ; msg_sched done: 64-67 + ; msg_sched: 72-75 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm2, ymm1, 4 + vpalignr ymm4, ymm0, ymm3, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+288] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 
2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+292] + xor ecx, r9d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm0, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm1 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+296] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+300] + xor ecx, r15d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror 
ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd ymm1, ymm9, ymm4 + ; msg_sched done: 72-75 + ; msg_sched: 80-83 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm3, ymm2, 4 + vpalignr ymm4, ymm1, ymm0, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+320] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+324] + xor ecx, r13d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm1, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm2 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+328] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 
11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+332] + xor ecx, r11d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd ymm2, ymm9, ymm4 + ; msg_sched done: 80-83 + ; msg_sched: 88-91 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm0, ymm3, 4 + vpalignr ymm4, ymm2, ymm1, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+352] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+356] + xor ecx, r9d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm2, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + 
and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm3 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+360] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+364] + xor ecx, r15d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd ymm3, ymm9, ymm4 + ; msg_sched done: 88-91 + ; set_w_k_xfer_4: 12 + vpaddd ymm4, ymm0, YMMWORD PTR [rbp+384] + vpaddd ymm5, ymm1, YMMWORD PTR [rbp+416] + vmovdqu YMMWORD PTR [rsp+384], ymm4 + vmovdqu YMMWORD PTR [rsp+416], ymm5 + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+448] + vpaddd ymm5, ymm3, YMMWORD PTR [rbp+480] + vmovdqu YMMWORD PTR [rsp+448], ymm4 + vmovdqu YMMWORD PTR [rsp+480], 
ymm5 + ; rnd_all_4: 24-27 + add r15d, DWORD PTR [rsp+384] + mov ecx, r13d + mov eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+388] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+392] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+396] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 26-29 + add r11d, DWORD PTR [rsp+416] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d 
+ ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+420] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+424] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+428] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + ; rnd_all_4: 28-31 + add r15d, DWORD PTR [rsp+448] + mov ecx, r13d + mov eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+452] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + 
add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+456] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+460] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 30-33 + add r11d, DWORD PTR [rsp+480] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+484] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+488] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror 
edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+492] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + add DWORD PTR [rdi], r8d + add DWORD PTR [rdi+4], r9d + add DWORD PTR [rdi+8], r10d + add DWORD PTR [rdi+12], r11d + add DWORD PTR [rdi+16], r12d + add DWORD PTR [rdi+20], r13d + add DWORD PTR [rdi+24], r14d + add DWORD PTR [rdi+28], r15d + xor rax, rax + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+512] + vmovdqu xmm7, OWORD PTR [rsp+528] + vmovdqu xmm8, OWORD PTR [rsp+544] + vmovdqu xmm9, OWORD PTR [rsp+560] + vmovdqu xmm10, OWORD PTR [rsp+576] + vmovdqu xmm11, OWORD PTR [rsp+592] + vmovdqu xmm12, OWORD PTR [rsp+608] + vmovdqu xmm13, OWORD PTR [rsp+624] + add rsp, 640 + pop rsi + pop rdi + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha256_AVX2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha256_AVX2_Len PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbp + mov rdi, rcx + mov rsi, rdx + mov rbp, r8 + sub rsp, 644 + vmovdqu OWORD PTR [rsp+512], xmm6 + vmovdqu OWORD PTR [rsp+528], xmm7 + vmovdqu OWORD PTR [rsp+544], xmm8 + vmovdqu OWORD PTR [rsp+560], xmm9 + vmovdqu OWORD PTR [rsp+576], xmm10 + vmovdqu OWORD PTR [rsp+592], xmm11 + vmovdqu OWORD PTR [rsp+608], xmm12 + vmovdqu OWORD PTR [rsp+624], xmm13 + test bpl, 64 + mov DWORD PTR [rsp+512], ebp + je L_sha256_len_avx2_block + vmovdqu ymm0, YMMWORD PTR [rsi] + vmovdqu ymm1, 
YMMWORD PTR [rsi+32] + vmovups YMMWORD PTR [rdi+32], ymm0 + vmovups YMMWORD PTR [rdi+64], ymm1 + call Transform_Sha256_AVX2 + add rsi, 64 + sub DWORD PTR [rsp+512], 64 + jz L_sha256_len_avx2_done +L_sha256_len_avx2_block: + mov rbp, QWORD PTR [ptr_L_avx2_sha256_k] + vmovdqu ymm13, YMMWORD PTR L_avx2_sha256_flip_mask + vmovdqu ymm11, YMMWORD PTR L_avx2_sha256_shuf_00BA + vmovdqu ymm12, YMMWORD PTR L_avx2_sha256_shuf_DC00 + mov r8d, DWORD PTR [rdi] + mov r9d, DWORD PTR [rdi+4] + mov r10d, DWORD PTR [rdi+8] + mov r11d, DWORD PTR [rdi+12] + mov r12d, DWORD PTR [rdi+16] + mov r13d, DWORD PTR [rdi+20] + mov r14d, DWORD PTR [rdi+24] + mov r15d, DWORD PTR [rdi+28] + ; Start of loop processing two blocks +L_sha256_len_avx2_start: + ; X0, X1, X2, X3 = W[0..15] + vmovdqu xmm0, OWORD PTR [rsi] + vmovdqu xmm1, OWORD PTR [rsi+16] + vmovdqu xmm4, OWORD PTR [rsi+64] + vmovdqu xmm5, OWORD PTR [rsi+80] + vinserti128 ymm0, ymm0, xmm4, 1 + vinserti128 ymm1, ymm1, xmm5, 1 + vpshufb ymm0, ymm0, ymm13 + vpshufb ymm1, ymm1, ymm13 + vmovdqu xmm2, OWORD PTR [rsi+32] + vmovdqu xmm3, OWORD PTR [rsi+48] + vmovdqu xmm6, OWORD PTR [rsi+96] + vmovdqu xmm7, OWORD PTR [rsi+112] + vinserti128 ymm2, ymm2, xmm6, 1 + vinserti128 ymm3, ymm3, xmm7, 1 + vpshufb ymm2, ymm2, ymm13 + vpshufb ymm3, ymm3, ymm13 + mov ebx, r9d + mov edx, r12d + xor ebx, r10d + ; set_w_k_xfer_4: 0 + vpaddd ymm4, ymm0, YMMWORD PTR [rbp] + vpaddd ymm5, ymm1, YMMWORD PTR [rbp+32] + vmovdqu YMMWORD PTR [rsp], ymm4 + vmovdqu YMMWORD PTR [rsp+32], ymm5 + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+64] + vpaddd ymm5, ymm3, YMMWORD PTR [rbp+96] + vmovdqu YMMWORD PTR [rsp+64], ymm4 + vmovdqu YMMWORD PTR [rsp+96], ymm5 + ; msg_sched: 0-3 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm1, ymm0, 4 + vpalignr ymm4, ymm3, ymm2, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + 
xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+4] + xor ecx, r13d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm3, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm0 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+8] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+12] + xor ecx, r11d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + 
add r12d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd ymm0, ymm9, ymm4 + ; msg_sched done: 0-3 + ; msg_sched: 8-11 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm2, ymm1, 4 + vpalignr ymm4, ymm0, ymm3, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+32] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+36] + xor ecx, r9d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm0, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm1 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+40] + xor ecx, r8d + xor edx, r14d + and ecx, r14d 
+ ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+44] + xor ecx, r15d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd ymm1, ymm9, ymm4 + ; msg_sched done: 8-11 + ; msg_sched: 16-19 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm3, ymm2, 4 + vpalignr ymm4, ymm1, ymm0, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+64] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + 
add r14d, DWORD PTR [rsp+68] + xor ecx, r13d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm1, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm2 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+72] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+76] + xor ecx, r11d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd ymm2, ymm9, ymm4 + ; msg_sched done: 16-19 + 
; msg_sched: 24-27 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm0, ymm3, 4 + vpalignr ymm4, ymm2, ymm1, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+96] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+100] + xor ecx, r9d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm2, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm3 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+104] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror 
ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+108] + xor ecx, r15d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd ymm3, ymm9, ymm4 + ; msg_sched done: 24-27 + ; set_w_k_xfer_4: 4 + vpaddd ymm4, ymm0, YMMWORD PTR [rbp+128] + vpaddd ymm5, ymm1, YMMWORD PTR [rbp+160] + vmovdqu YMMWORD PTR [rsp+128], ymm4 + vmovdqu YMMWORD PTR [rsp+160], ymm5 + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+192] + vpaddd ymm5, ymm3, YMMWORD PTR [rbp+224] + vmovdqu YMMWORD PTR [rsp+192], ymm4 + vmovdqu YMMWORD PTR [rsp+224], ymm5 + ; msg_sched: 32-35 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm1, ymm0, 4 + vpalignr ymm4, ymm3, ymm2, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+128] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+132] + xor ecx, r13d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + 
; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm3, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm0 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+136] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+140] + xor ecx, r11d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd ymm0, ymm9, ymm4 + ; msg_sched done: 32-35 + ; msg_sched: 40-43 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm2, ymm1, 4 + vpalignr 
ymm4, ymm0, ymm3, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+160] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+164] + xor ecx, r9d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm0, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm1 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+168] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 
80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+172] + xor ecx, r15d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd ymm1, ymm9, ymm4 + ; msg_sched done: 40-43 + ; msg_sched: 48-51 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm3, ymm2, 4 + vpalignr ymm4, ymm1, ymm0, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+192] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+196] + xor ecx, r13d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm1, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 
+ mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm2 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+200] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+204] + xor ecx, r11d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd ymm2, ymm9, ymm4 + ; msg_sched done: 48-51 + ; msg_sched: 56-59 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm0, ymm3, 4 + vpalignr ymm4, ymm2, ymm1, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+224] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, 
r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+228] + xor ecx, r9d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm2, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm3 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+232] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+236] + xor ecx, r15d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, 
r10d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd ymm3, ymm9, ymm4 + ; msg_sched done: 56-59 + ; set_w_k_xfer_4: 8 + vpaddd ymm4, ymm0, YMMWORD PTR [rbp+256] + vpaddd ymm5, ymm1, YMMWORD PTR [rbp+288] + vmovdqu YMMWORD PTR [rsp+256], ymm4 + vmovdqu YMMWORD PTR [rsp+288], ymm5 + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+320] + vpaddd ymm5, ymm3, YMMWORD PTR [rbp+352] + vmovdqu YMMWORD PTR [rsp+320], ymm4 + vmovdqu YMMWORD PTR [rsp+352], ymm5 + ; msg_sched: 64-67 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm1, ymm0, 4 + vpalignr ymm4, ymm3, ymm2, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+256] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+260] + xor ecx, r13d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm3, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 
+ vpaddd ymm4, ymm4, ymm0 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+264] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r14d + xor ebx, r15d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+268] + xor ecx, r11d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd ymm0, ymm9, ymm4 + ; msg_sched done: 64-67 + ; msg_sched: 72-75 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm2, ymm1, 4 + vpalignr ymm4, ymm0, ymm3, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+288] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + 
vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+292] + xor ecx, r9d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm0, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm1 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+296] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+300] + xor ecx, r15d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor 
ecx, r9d + add r8d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd ymm1, ymm9, ymm4 + ; msg_sched done: 72-75 + ; msg_sched: 80-83 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm3, ymm2, 4 + vpalignr ymm4, ymm1, ymm0, 4 + ; rnd_0: 1 - 2 + mov eax, r9d + mov ecx, r13d + add r15d, DWORD PTR [rsp+320] + xor ecx, r14d + xor edx, r12d + and ecx, r12d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r14d + xor edx, r12d + add r15d, ecx + ror edx, 6 + xor eax, r8d + add r15d, edx + mov ecx, r8d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r11d + add r15d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r8d + mov ecx, r12d + add r14d, DWORD PTR [rsp+324] + xor ecx, r13d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r11d + and ecx, r11d + ror edx, 5 + xor ecx, r13d + xor edx, r11d + add r14d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm1, 250 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r15d + add r14d, edx + mov ecx, r15d + and eax, ebx + ror ecx, 9 + xor ecx, r15d + xor eax, r8d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm2 + ; rnd_0: 1 - 3 + mov eax, r15d + mov ecx, r11d + add r13d, DWORD PTR [rsp+328] + xor ecx, r12d + xor edx, r10d + and ecx, r10d + ror edx, 5 + xor ecx, r12d + xor edx, r10d + add r13d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r14d + add r13d, edx + mov ecx, r14d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, 
r14d + xor ebx, r15d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r9d + add r13d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r14d + mov ecx, r10d + add r12d, DWORD PTR [rsp+332] + xor ecx, r11d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r9d + and ecx, r9d + ror edx, 5 + xor ecx, r11d + xor edx, r9d + add r12d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r13d + add r12d, edx + mov ecx, r13d + and eax, ebx + ror ecx, 9 + xor ecx, r13d + xor eax, r14d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r8d + add r12d, ecx + vpaddd ymm2, ymm9, ymm4 + ; msg_sched done: 80-83 + ; msg_sched: 88-91 + ; rnd_0: 0 - 0 + ror edx, 14 + vpalignr ymm5, ymm0, ymm3, 4 + vpalignr ymm4, ymm2, ymm1, 4 + ; rnd_0: 1 - 2 + mov eax, r13d + mov ecx, r9d + add r11d, DWORD PTR [rsp+352] + xor ecx, r10d + xor edx, r8d + and ecx, r8d + vpsrld ymm6, ymm5, 7 + vpslld ymm7, ymm5, 25 + ; rnd_0: 3 - 4 + ror edx, 5 + xor ecx, r10d + xor edx, r8d + add r11d, ecx + ror edx, 6 + xor eax, r12d + add r11d, edx + mov ecx, r12d + vpsrld ymm8, ymm5, 18 + vpslld ymm9, ymm5, 14 + ; rnd_0: 5 - 6 + and ebx, eax + ror ecx, 9 + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + vpor ymm6, ymm7, ymm6 + vpor ymm8, ymm9, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r15d + add r11d, ecx + ; rnd_1: 0 - 1 + ror edx, 14 + mov ebx, r12d + mov ecx, r8d + add r10d, DWORD PTR [rsp+356] + xor ecx, r9d + vpsrld ymm9, ymm5, 3 + vpxor ymm6, ymm8, ymm6 + ; rnd_1: 2 - 3 + xor edx, r15d + and ecx, r15d + ror edx, 5 + xor ecx, r9d + xor edx, r15d + add r10d, ecx + vpxor ymm5, ymm9, ymm6 + vpshufd ymm6, ymm2, 250 + ; 
rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r11d + add r10d, edx + mov ecx, r11d + and eax, ebx + ror ecx, 9 + xor ecx, r11d + xor eax, r12d + vpsrld ymm8, ymm6, 10 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 6 - 7 + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + ; rnd_0: 0 - 0 + ror edx, 14 + vpsrlq ymm6, ymm6, 17 + vpaddd ymm4, ymm4, ymm3 + ; rnd_0: 1 - 3 + mov eax, r11d + mov ecx, r15d + add r9d, DWORD PTR [rsp+360] + xor ecx, r8d + xor edx, r14d + and ecx, r14d + ror edx, 5 + xor ecx, r8d + xor edx, r14d + add r9d, ecx + vpxor ymm6, ymm7, ymm6 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 4 - 4 + ror edx, 6 + xor eax, r10d + add r9d, edx + mov ecx, r10d + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 5 - 5 + and ebx, eax + ror ecx, 9 + xor ecx, r10d + xor ebx, r11d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 6 - 6 + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 7 - 7 + ror ecx, 2 + mov edx, r13d + add r9d, ecx + ; rnd_1: 0 - 0 + ror edx, 14 + vpshufd ymm6, ymm4, 80 + ; rnd_1: 1 - 1 + mov ebx, r10d + mov ecx, r14d + add r8d, DWORD PTR [rsp+364] + xor ecx, r15d + vpsrlq ymm8, ymm6, 17 + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 2 - 3 + xor edx, r13d + and ecx, r13d + ror edx, 5 + xor ecx, r15d + xor edx, r13d + add r8d, ecx + vpsrld ymm9, ymm6, 10 + vpxor ymm8, ymm7, ymm8 + ; rnd_1: 4 - 5 + ror edx, 6 + xor ebx, r9d + add r8d, edx + mov ecx, r9d + and eax, ebx + ror ecx, 9 + xor ecx, r9d + xor eax, r10d + vpxor ymm9, ymm8, ymm9 + ; rnd_1: 6 - 6 + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 7 - 7 + ror ecx, 2 + mov edx, r12d + add r8d, ecx + vpaddd ymm3, ymm9, ymm4 + ; msg_sched done: 88-91 + ; set_w_k_xfer_4: 12 + vpaddd ymm4, ymm0, YMMWORD PTR [rbp+384] + vpaddd ymm5, ymm1, YMMWORD PTR [rbp+416] + vmovdqu YMMWORD PTR [rsp+384], ymm4 + vmovdqu YMMWORD PTR [rsp+416], ymm5 + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+448] + vpaddd ymm5, ymm3, YMMWORD PTR 
[rbp+480] + vmovdqu YMMWORD PTR [rsp+448], ymm4 + vmovdqu YMMWORD PTR [rsp+480], ymm5 + ; rnd_all_4: 24-27 + add r15d, DWORD PTR [rsp+384] + mov ecx, r13d + mov eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+388] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+392] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+396] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 26-29 + add r11d, DWORD PTR [rsp+416] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, 
r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+420] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+424] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+428] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + ; rnd_all_4: 28-31 + add r15d, DWORD PTR [rsp+448] + mov ecx, r13d + mov eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+452] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 
5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+456] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+460] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 30-33 + add r11d, DWORD PTR [rsp+480] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+484] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+488] + mov ecx, r15d + mov eax, r11d + 
xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+492] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + add r8d, DWORD PTR [rdi] + add r9d, DWORD PTR [rdi+4] + add r10d, DWORD PTR [rdi+8] + add r11d, DWORD PTR [rdi+12] + add r12d, DWORD PTR [rdi+16] + add r13d, DWORD PTR [rdi+20] + add r14d, DWORD PTR [rdi+24] + add r15d, DWORD PTR [rdi+28] + mov DWORD PTR [rdi], r8d + mov DWORD PTR [rdi+4], r9d + mov DWORD PTR [rdi+8], r10d + mov DWORD PTR [rdi+12], r11d + mov DWORD PTR [rdi+16], r12d + mov DWORD PTR [rdi+20], r13d + mov DWORD PTR [rdi+24], r14d + mov DWORD PTR [rdi+28], r15d + mov ebx, r9d + mov edx, r12d + xor ebx, r10d + ; rnd_all_4: 1-4 + add r15d, DWORD PTR [rsp+16] + mov ecx, r13d + mov eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+20] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + 
xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+24] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+28] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 3-6 + add r11d, DWORD PTR [rsp+48] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+52] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+56] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + 
mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+60] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + ; rnd_all_4: 5-8 + add r15d, DWORD PTR [rsp+80] + mov ecx, r13d + mov eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+84] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+88] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+92] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror 
edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 7-10 + add r11d, DWORD PTR [rsp+112] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+116] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+120] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+124] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + ; rnd_all_4: 9-12 + add r15d, DWORD PTR [rsp+144] + mov ecx, r13d + mov 
eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+148] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+152] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+156] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 11-14 + add r11d, DWORD PTR [rsp+176] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + 
mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+180] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+184] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+188] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + ; rnd_all_4: 13-16 + add r15d, DWORD PTR [rsp+208] + mov ecx, r13d + mov eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+212] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror 
ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+216] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+220] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 15-18 + add r11d, DWORD PTR [rsp+240] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+244] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+248] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, 
r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+252] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + ; rnd_all_4: 17-20 + add r15d, DWORD PTR [rsp+272] + mov ecx, r13d + mov eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+276] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+280] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+284] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror 
edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 19-22 + add r11d, DWORD PTR [rsp+304] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+308] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+312] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+316] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + ; rnd_all_4: 21-24 + add r15d, DWORD PTR [rsp+336] + mov ecx, r13d + mov 
eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+340] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+344] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+348] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 23-26 + add r11d, DWORD PTR [rsp+368] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + 
mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+372] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+376] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+380] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + ; rnd_all_4: 25-28 + add r15d, DWORD PTR [rsp+400] + mov ecx, r13d + mov eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+404] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror 
ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+408] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+412] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 27-30 + add r11d, DWORD PTR [rsp+432] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+436] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+440] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, 
r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+444] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + ; rnd_all_4: 29-32 + add r15d, DWORD PTR [rsp+464] + mov ecx, r13d + mov eax, r9d + xor ecx, r14d + ror edx, 14 + and ecx, r12d + xor edx, r12d + xor ecx, r14d + ror edx, 5 + add r15d, ecx + xor edx, r12d + xor eax, r8d + ror edx, 6 + mov ecx, r8d + add r15d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r8d + xor ebx, r9d + ror ecx, 11 + add r11d, r15d + xor ecx, r8d + add r15d, ebx + ror ecx, 2 + mov edx, r11d + add r15d, ecx + add r14d, DWORD PTR [rsp+468] + mov ecx, r12d + mov ebx, r8d + xor ecx, r13d + ror edx, 14 + and ecx, r11d + xor edx, r11d + xor ecx, r13d + ror edx, 5 + add r14d, ecx + xor edx, r11d + xor ebx, r15d + ror edx, 6 + mov ecx, r15d + add r14d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r15d + xor eax, r8d + ror ecx, 11 + add r10d, r14d + xor ecx, r15d + add r14d, eax + ror ecx, 2 + mov edx, r10d + add r14d, ecx + add r13d, DWORD PTR [rsp+472] + mov ecx, r11d + mov eax, r15d + xor ecx, r12d + ror edx, 14 + and ecx, r10d + xor edx, r10d + xor ecx, r12d + ror edx, 5 + add r13d, ecx + xor edx, r10d + xor eax, r14d + ror edx, 6 + mov ecx, r14d + add r13d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r14d + xor ebx, r15d + ror ecx, 11 + add r9d, r13d + xor ecx, r14d + add r13d, ebx + ror ecx, 2 + mov edx, r9d + add r13d, ecx + add r12d, DWORD PTR [rsp+476] + mov ecx, r10d + mov ebx, r14d + xor ecx, r11d + ror edx, 14 + and ecx, r9d + xor edx, r9d + xor ecx, r11d + ror 
edx, 5 + add r12d, ecx + xor edx, r9d + xor ebx, r13d + ror edx, 6 + mov ecx, r13d + add r12d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r13d + xor eax, r14d + ror ecx, 11 + add r8d, r12d + xor ecx, r13d + add r12d, eax + ror ecx, 2 + mov edx, r8d + add r12d, ecx + ; rnd_all_4: 31-34 + add r11d, DWORD PTR [rsp+496] + mov ecx, r9d + mov eax, r13d + xor ecx, r10d + ror edx, 14 + and ecx, r8d + xor edx, r8d + xor ecx, r10d + ror edx, 5 + add r11d, ecx + xor edx, r8d + xor eax, r12d + ror edx, 6 + mov ecx, r12d + add r11d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r12d + xor ebx, r13d + ror ecx, 11 + add r15d, r11d + xor ecx, r12d + add r11d, ebx + ror ecx, 2 + mov edx, r15d + add r11d, ecx + add r10d, DWORD PTR [rsp+500] + mov ecx, r8d + mov ebx, r12d + xor ecx, r9d + ror edx, 14 + and ecx, r15d + xor edx, r15d + xor ecx, r9d + ror edx, 5 + add r10d, ecx + xor edx, r15d + xor ebx, r11d + ror edx, 6 + mov ecx, r11d + add r10d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r11d + xor eax, r12d + ror ecx, 11 + add r14d, r10d + xor ecx, r11d + add r10d, eax + ror ecx, 2 + mov edx, r14d + add r10d, ecx + add r9d, DWORD PTR [rsp+504] + mov ecx, r15d + mov eax, r11d + xor ecx, r8d + ror edx, 14 + and ecx, r14d + xor edx, r14d + xor ecx, r8d + ror edx, 5 + add r9d, ecx + xor edx, r14d + xor eax, r10d + ror edx, 6 + mov ecx, r10d + add r9d, edx + ror ecx, 9 + and ebx, eax + xor ecx, r10d + xor ebx, r11d + ror ecx, 11 + add r13d, r9d + xor ecx, r10d + add r9d, ebx + ror ecx, 2 + mov edx, r13d + add r9d, ecx + add r8d, DWORD PTR [rsp+508] + mov ecx, r14d + mov ebx, r10d + xor ecx, r15d + ror edx, 14 + and ecx, r13d + xor edx, r13d + xor ecx, r15d + ror edx, 5 + add r8d, ecx + xor edx, r13d + xor ebx, r9d + ror edx, 6 + mov ecx, r9d + add r8d, edx + ror ecx, 9 + and eax, ebx + xor ecx, r9d + xor eax, r10d + ror ecx, 11 + add r12d, r8d + xor ecx, r9d + add r8d, eax + ror ecx, 2 + mov edx, r12d + add r8d, ecx + add r8d, DWORD PTR [rdi] + add r9d, DWORD PTR [rdi+4] + add r10d, DWORD 
PTR [rdi+8]
+        add r11d, DWORD PTR [rdi+12]
+        add r12d, DWORD PTR [rdi+16]
+        add r13d, DWORD PTR [rdi+20]
+        add r14d, DWORD PTR [rdi+24]
+        add r15d, DWORD PTR [rdi+28]
+        add rsi, 128
+        sub DWORD PTR [rsp+512], 128
+        mov DWORD PTR [rdi], r8d
+        mov DWORD PTR [rdi+4], r9d
+        mov DWORD PTR [rdi+8], r10d
+        mov DWORD PTR [rdi+12], r11d
+        mov DWORD PTR [rdi+16], r12d
+        mov DWORD PTR [rdi+20], r13d
+        mov DWORD PTR [rdi+24], r14d
+        mov DWORD PTR [rdi+28], r15d
+        jnz L_sha256_len_avx2_start
+L_sha256_len_avx2_done:
+        xor rax, rax
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp+512]    ; NOTE(review): [rsp+512] is also the DWORD counter decremented by the `sub` above - confirm against the prologue (not visible here) that the xmm6 save slot does not overlap it
+        vmovdqu xmm7, OWORD PTR [rsp+528]
+        vmovdqu xmm8, OWORD PTR [rsp+544]
+        vmovdqu xmm9, OWORD PTR [rsp+560]
+        vmovdqu xmm10, OWORD PTR [rsp+576]
+        vmovdqu xmm11, OWORD PTR [rsp+592]
+        vmovdqu xmm12, OWORD PTR [rsp+608]
+        vmovdqu xmm13, OWORD PTR [rsp+624]
+        add rsp, 644
+        pop rbp
+        pop rsi
+        pop rdi
+        pop r15
+        pop r14
+        pop r13
+        pop r12
+        pop rbx
+        ret
+Transform_Sha256_AVX2_Len ENDP
+_TEXT ENDS
+_DATA SEGMENT    ; SHA-256 round-constant table K[0..63] for the RORX code path (values match FIPS 180-4)
+ALIGN 16
+L_avx2_rorx_sha256_k DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h    ; every row of four constants is emitted twice in a row,
+        DWORD 428a2f98h, 71374491h, 0b5c0fbcfh, 0e9b5dba5h    ; so each 32-byte YMMWORD slice holds the same four K values in both 128-bit halves
+        DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h
+        DWORD 3956c25bh, 59f111f1h, 923f82a4h, 0ab1c5ed5h
+        DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h
+        DWORD 0d807aa98h, 12835b01h, 243185beh, 550c7dc3h
+        DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h
+        DWORD 72be5d74h, 80deb1feh, 9bdc06a7h, 0c19bf174h
+        DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch
+        DWORD 0e49b69c1h, 0efbe4786h, 0fc19dc6h, 240ca1cch
+        DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah
+        DWORD 2de92c6fh, 4a7484aah, 5cb0a9dch, 76f988dah
+        DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h
+        DWORD 983e5152h, 0a831c66dh, 0b00327c8h, 0bf597fc7h
+        DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h
+        DWORD 0c6e00bf3h, 0d5a79147h, 06ca6351h, 14292967h
+        DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h
+        DWORD 27b70a85h, 2e1b2138h, 4d2c6dfch, 53380d13h
+        DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h
+        DWORD 650a7354h, 766a0abbh, 81c2c92eh, 92722c85h
+        DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h
+        DWORD 0a2bfe8a1h, 0a81a664bh, 0c24b8b70h, 0c76c51a3h
+        DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h
+        DWORD 0d192e819h, 0d6990624h, 0f40e3585h, 106aa070h
+        DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h
+        DWORD 19a4c116h, 1e376c08h, 2748774ch, 34b0bcb5h
+        DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h
+        DWORD 391c0cb3h, 4ed8aa4ah, 5b9cca4fh, 682e6ff3h
+        DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h
+        DWORD 748f82eeh, 78a5636fh, 84c87814h, 8cc70208h
+        DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h
+        DWORD 90befffah, 0a4506cebh, 0bef9a3f7h, 0c67178f2h
+ptr_L_avx2_rorx_sha256_k QWORD L_avx2_rorx_sha256_k    ; holds the table's address; Transform_Sha256_AVX2_RORX loads it into rbp and indexes K as [rbp+offset]
+_DATA ENDS
+_DATA SEGMENT    ; vpshufb control mask applied to each 16-byte chunk of input read from [rsi]
+ALIGN 16    ; the mask is fetched with vmovdqa (an aligned OWORD load) into xmm13
+L_avx2_rorx_sha256_flip_mask QWORD 0405060700010203h, 0c0d0e0f08090a0bh    ; byte indices 3,2,1,0 / 7,6,5,4 / 11,10,9,8 / 15,14,13,12: reverses the bytes inside every 32-bit dword
+        QWORD 0405060700010203h, 0c0d0e0f08090a0bh    ; same 16-byte pattern repeated, giving a 32-byte mask
+ptr_L_avx2_rorx_sha256_flip_mask QWORD L_avx2_rorx_sha256_flip_mask    ; address of the mask above
+_DATA ENDS
+_DATA SEGMENT    ; vpshufb control mask loaded into ymm11 by Transform_Sha256_AVX2_RORX
+ALIGN 16
+L_avx2_rorx_sha256_shuf_00BA QWORD 0b0a090803020100h, 0ffffffffffffffffh    ; byte indices 0-3 then 8-11: source dwords 0 and 2 land in dwords 0 and 1; the 0ffh index bytes make vpshufb write zero to dwords 2 and 3
+        QWORD 0b0a090803020100h, 0ffffffffffffffffh    ; same pattern for the upper 128-bit lane
+ptr_L_avx2_rorx_sha256_shuf_00BA QWORD L_avx2_rorx_sha256_shuf_00BA    ; address of the mask above
+_DATA ENDS
+_DATA SEGMENT    ; vpshufb control mask loaded into ymm12 by Transform_Sha256_AVX2_RORX
+ALIGN 16
+L_avx2_rorx_sha256_shuf_DC00 QWORD 0ffffffffffffffffh, 0b0a090803020100h    ; mirror of shuf_00BA: dwords 0 and 1 are zeroed and source dwords 0 and 2 land in dwords 2 and 3
+        QWORD 0ffffffffffffffffh, 0b0a090803020100h    ; same pattern for the upper 128-bit lane
+ptr_L_avx2_rorx_sha256_shuf_DC00 QWORD L_avx2_rorx_sha256_shuf_DC00    ; address of the mask above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+Transform_Sha256_AVX2_RORX PROC
+        push rbx
+        push r12
+        push r13
+        push r14
+        push r15
+        push rbp
+        push rdi
+        push rsi
+        mov rdi, rcx
+        mov rsi, rdx
+        sub rsp, 640
+        vmovdqu OWORD PTR [rsp+512], xmm6
+        vmovdqu OWORD PTR [rsp+528], xmm7
+        vmovdqu OWORD PTR [rsp+544], xmm8
+        vmovdqu OWORD PTR [rsp+560], xmm9
+        vmovdqu OWORD PTR [rsp+576], xmm10
+        vmovdqu OWORD PTR [rsp+592], xmm11
+        vmovdqu OWORD PTR [rsp+608], xmm12
+        vmovdqu OWORD PTR [rsp+624], xmm13
+        mov rbp, QWORD PTR [ptr_L_avx2_rorx_sha256_k]
+        vmovdqa xmm13, 
OWORD PTR L_avx2_rorx_sha256_flip_mask + vmovdqu ymm11, YMMWORD PTR L_avx2_rorx_sha256_shuf_00BA + vmovdqu ymm12, YMMWORD PTR L_avx2_rorx_sha256_shuf_DC00 + ; X0, X1, X2, X3 = W[0..15] + vmovdqu xmm0, OWORD PTR [rsi] + vmovdqu xmm1, OWORD PTR [rsi+16] + vpshufb xmm0, xmm0, xmm13 + vpshufb xmm1, xmm1, xmm13 + vpaddd ymm4, ymm0, YMMWORD PTR [rbp] + vpaddd ymm5, ymm1, YMMWORD PTR [rbp+32] + vmovdqu YMMWORD PTR [rsp], ymm4 + vmovdqu YMMWORD PTR [rsp+32], ymm5 + vmovdqu xmm2, OWORD PTR [rsi+32] + vmovdqu xmm3, OWORD PTR [rsi+48] + vpshufb xmm2, xmm2, xmm13 + vpshufb xmm3, xmm3, xmm13 + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+64] + vpaddd ymm5, ymm3, YMMWORD PTR [rbp+96] + vmovdqu YMMWORD PTR [rsp+64], ymm4 + vmovdqu YMMWORD PTR [rsp+96], ymm5 + mov r8d, DWORD PTR [rdi] + mov r9d, DWORD PTR [rdi+4] + mov r10d, DWORD PTR [rdi+8] + mov r11d, DWORD PTR [rdi+12] + mov r12d, DWORD PTR [rdi+16] + mov r13d, DWORD PTR [rdi+20] + mov r14d, DWORD PTR [rdi+24] + mov r15d, DWORD PTR [rdi+28] + mov ebx, r9d + rorx edx, r12d, 6 + xor ebx, r10d + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp] + vpalignr ymm5, ymm1, ymm0, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + vpalignr ymm4, ymm3, ymm2, 4 + ; rnd_0: 2 - 2 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r9d + add r11d, r15d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r8d + add r15d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+4] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpxor ymm6, ymm6, ymm8 + ; 
rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpshufd ymm7, ymm3, 250 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r10d, r14d + mov ebx, r8d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + vpaddd ymm4, ymm4, ymm0 + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+8] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+12] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpaddd ymm0, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r8d, r12d + mov ebx, r14d + vpaddd ymm4, ymm0, YMMWORD PTR [rbp+128] + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r14d + 
rorx edx, r8d, 6 + add r12d, eax + vmovdqu YMMWORD PTR [rsp+128], ymm4 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+32] + vpalignr ymm5, ymm2, ymm1, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + vpalignr ymm4, ymm0, ymm3, 4 + ; rnd_0: 2 - 2 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r13d + add r15d, r11d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r12d + add r11d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+36] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpshufd ymm7, ymm0, 250 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r14d, r10d + mov ebx, r12d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + vpaddd ymm4, ymm4, ymm1 + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+40] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor 
ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+44] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpaddd ymm1, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r12d, r8d + mov ebx, r10d + vpaddd ymm4, ymm1, YMMWORD PTR [rbp+160] + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vmovdqu YMMWORD PTR [rsp+160], ymm4 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+64] + vpalignr ymm5, ymm3, ymm2, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + vpalignr ymm4, ymm1, ymm0, 4 + ; rnd_0: 2 - 2 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r9d + add r11d, r15d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r8d + add r15d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+68] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + 
xor ebx, r13d + rorx edx, r11d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpshufd ymm7, ymm1, 250 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r10d, r14d + mov ebx, r8d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + vpaddd ymm4, ymm4, ymm2 + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+72] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+76] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpaddd ymm2, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r8d, r12d + mov ebx, r14d + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+192] + ; rnd_1: 6 - 6 + xor ebx, r13d + 
add r12d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vmovdqu YMMWORD PTR [rsp+192], ymm4 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+96] + vpalignr ymm5, ymm0, ymm3, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + vpalignr ymm4, ymm2, ymm1, 4 + ; rnd_0: 2 - 2 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r13d + add r15d, r11d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r12d + add r11d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+100] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpshufd ymm7, ymm2, 250 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r14d, r10d + mov ebx, r12d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + vpaddd ymm4, ymm4, ymm3 + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+104] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 
+ xor eax, r8d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+108] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpaddd ymm3, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r12d, r8d + mov ebx, r10d + vpaddd ymm4, ymm3, YMMWORD PTR [rbp+224] + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vmovdqu YMMWORD PTR [rsp+224], ymm4 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+128] + vpalignr ymm5, ymm1, ymm0, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + vpalignr ymm4, ymm3, ymm2, 4 + ; rnd_0: 2 - 2 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r9d + add r11d, r15d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r8d + add r15d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR 
[rsp+132] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpshufd ymm7, ymm3, 250 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r10d, r14d + mov ebx, r8d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + vpaddd ymm4, ymm4, ymm0 + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+136] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+140] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpaddd ymm0, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r8d, r12d + mov ebx, r14d + vpaddd 
ymm4, ymm0, YMMWORD PTR [rbp+256] + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vmovdqu YMMWORD PTR [rsp+256], ymm4 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+160] + vpalignr ymm5, ymm2, ymm1, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + vpalignr ymm4, ymm0, ymm3, 4 + ; rnd_0: 2 - 2 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r13d + add r15d, r11d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r12d + add r11d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+164] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpshufd ymm7, ymm0, 250 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r14d, r10d + mov ebx, r12d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + vpaddd ymm4, ymm4, ymm1 + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+168] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd 
ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+172] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpaddd ymm1, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r12d, r8d + mov ebx, r10d + vpaddd ymm4, ymm1, YMMWORD PTR [rbp+288] + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vmovdqu YMMWORD PTR [rsp+288], ymm4 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+192] + vpalignr ymm5, ymm3, ymm2, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + vpalignr ymm4, ymm1, ymm0, 4 + ; rnd_0: 2 - 2 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r9d + add r11d, r15d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r8d + add r15d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + vpor ymm8, ymm8, ymm9 + ; 
rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+196] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpshufd ymm7, ymm1, 250 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r10d, r14d + mov ebx, r8d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + vpaddd ymm4, ymm4, ymm2 + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+200] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+204] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpaddd ymm2, ymm9, ymm4 
+ ; rnd_1: 5 - 5 + xor edx, ecx + add r8d, r12d + mov ebx, r14d + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+320] + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vmovdqu YMMWORD PTR [rsp+320], ymm4 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+224] + vpalignr ymm5, ymm0, ymm3, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + vpalignr ymm4, ymm2, ymm1, 4 + ; rnd_0: 2 - 2 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r13d + add r15d, r11d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r12d + add r11d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+228] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpshufd ymm7, ymm2, 250 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r14d, r10d + mov ebx, r12d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + vpaddd ymm4, ymm4, ymm3 + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+232] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpxor ymm8, ymm8, ymm6 + ; 
rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+236] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpaddd ymm3, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r12d, r8d + mov ebx, r10d + vpaddd ymm4, ymm3, YMMWORD PTR [rbp+352] + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vmovdqu YMMWORD PTR [rsp+352], ymm4 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+256] + vpalignr ymm5, ymm1, ymm0, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + vpalignr ymm4, ymm3, ymm2, 4 + ; rnd_0: 2 - 2 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r9d + add r11d, r15d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r8d + add r15d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor 
ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+260] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpshufd ymm7, ymm3, 250 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r10d, r14d + mov ebx, r8d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + vpaddd ymm4, ymm4, ymm0 + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+264] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+268] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor 
ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpaddd ymm0, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r8d, r12d + mov ebx, r14d + vpaddd ymm4, ymm0, YMMWORD PTR [rbp+384] + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vmovdqu YMMWORD PTR [rsp+384], ymm4 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+288] + vpalignr ymm5, ymm2, ymm1, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + vpalignr ymm4, ymm0, ymm3, 4 + ; rnd_0: 2 - 2 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r13d + add r15d, r11d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r12d + add r11d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+292] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpshufd ymm7, ymm0, 250 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r14d, r10d + mov ebx, r12d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + vpaddd ymm4, ymm4, ymm1 + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+296] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor 
ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+300] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpaddd ymm1, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r12d, r8d + mov ebx, r10d + vpaddd ymm4, ymm1, YMMWORD PTR [rbp+416] + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vmovdqu YMMWORD PTR [rsp+416], ymm4 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+320] + vpalignr ymm5, ymm3, ymm2, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + vpalignr ymm4, ymm1, ymm0, 4 + ; rnd_0: 2 - 2 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r9d + add r11d, r15d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r8d + add 
r15d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+324] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpshufd ymm7, ymm1, 250 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r10d, r14d + mov ebx, r8d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + vpaddd ymm4, ymm4, ymm2 + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+328] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+332] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, 
r13d, 2 + xor ebx, r11d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpaddd ymm2, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r8d, r12d + mov ebx, r14d + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+448] + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vmovdqu YMMWORD PTR [rsp+448], ymm4 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+352] + vpalignr ymm5, ymm0, ymm3, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + vpalignr ymm4, ymm2, ymm1, 4 + ; rnd_0: 2 - 2 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r13d + add r15d, r11d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r12d + add r11d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+356] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpshufd ymm7, ymm2, 250 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r14d, r10d + mov ebx, r12d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + vpaddd ymm4, ymm4, ymm3 + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add 
r9d, DWORD PTR [rsp+360] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+364] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpaddd ymm3, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r12d, r8d + mov ebx, r10d + vpaddd ymm4, ymm3, YMMWORD PTR [rbp+480] + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vmovdqu YMMWORD PTR [rsp+480], ymm4 + xor eax, eax + xor ecx, ecx + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + lea r8d, DWORD PTR [r8+rax] + add r15d, DWORD PTR [rsp+384] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR 
[rsp+388] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + lea r10d, DWORD PTR [r10+r14] + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + lea r14d, DWORD PTR [r14+rax] + add r13d, DWORD PTR [rsp+392] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+396] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + lea r8d, DWORD PTR [r8+r12] + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + lea r12d, DWORD PTR [r12+rax] + add r11d, DWORD PTR [rsp+416] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+420] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov 
ebx, r12d + lea r14d, DWORD PTR [r14+r10] + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + lea r10d, DWORD PTR [r10+rax] + add r9d, DWORD PTR [rsp+424] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+428] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + lea r12d, DWORD PTR [r12+r8] + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + lea r8d, DWORD PTR [r8+rax] + add r15d, DWORD PTR [rsp+448] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+452] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + lea r10d, DWORD PTR [r10+r14] + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + lea r14d, DWORD PTR [r14+rax] + add r13d, DWORD PTR [rsp+456] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, 
r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+460] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + lea r8d, DWORD PTR [r8+r12] + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + lea r12d, DWORD PTR [r12+rax] + add r11d, DWORD PTR [rsp+480] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+484] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + lea r14d, DWORD PTR [r14+r10] + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + lea r10d, DWORD PTR [r10+rax] + add r9d, DWORD PTR [rsp+488] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + 
xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+492] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + lea r12d, DWORD PTR [r12+r8] + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + add r8d, eax + add DWORD PTR [rdi], r8d + add DWORD PTR [rdi+4], r9d + add DWORD PTR [rdi+8], r10d + add DWORD PTR [rdi+12], r11d + add DWORD PTR [rdi+16], r12d + add DWORD PTR [rdi+20], r13d + add DWORD PTR [rdi+24], r14d + add DWORD PTR [rdi+28], r15d + xor rax, rax + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+512] + vmovdqu xmm7, OWORD PTR [rsp+528] + vmovdqu xmm8, OWORD PTR [rsp+544] + vmovdqu xmm9, OWORD PTR [rsp+560] + vmovdqu xmm10, OWORD PTR [rsp+576] + vmovdqu xmm11, OWORD PTR [rsp+592] + vmovdqu xmm12, OWORD PTR [rsp+608] + vmovdqu xmm13, OWORD PTR [rsp+624] + add rsp, 640 + pop rsi + pop rdi + pop rbp + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha256_AVX2_RORX ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha256_AVX2_RORX_Len PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbp + mov rdi, rcx + mov rsi, rdx + mov rbp, r8 + sub rsp, 644 + vmovdqu OWORD PTR [rsp+512], xmm6 + vmovdqu OWORD PTR [rsp+528], xmm7 + vmovdqu OWORD PTR [rsp+544], xmm8 + vmovdqu OWORD PTR [rsp+560], xmm9 + vmovdqu OWORD PTR [rsp+576], xmm10 + vmovdqu OWORD PTR [rsp+592], xmm11 + vmovdqu OWORD PTR [rsp+608], xmm12 + vmovdqu OWORD PTR [rsp+624], xmm13 + test bpl, 64 + mov DWORD PTR [rsp+512], ebp + je L_sha256_len_avx2_rorx_block + vmovdqu ymm0, YMMWORD PTR [rsi] + vmovdqu ymm1, YMMWORD PTR [rsi+32] + vmovups YMMWORD PTR [rdi+32], ymm0 + vmovups YMMWORD PTR [rdi+64], ymm1 + call Transform_Sha256_AVX2_RORX + add rsi, 64 + sub DWORD PTR [rsp+512], 64 + jz 
L_sha256_len_avx2_rorx_done +L_sha256_len_avx2_rorx_block: + mov rbp, QWORD PTR [ptr_L_avx2_rorx_sha256_k] + vmovdqu ymm13, YMMWORD PTR L_avx2_rorx_sha256_flip_mask + vmovdqu ymm11, YMMWORD PTR L_avx2_rorx_sha256_shuf_00BA + vmovdqu ymm12, YMMWORD PTR L_avx2_rorx_sha256_shuf_DC00 + mov r8d, DWORD PTR [rdi] + mov r9d, DWORD PTR [rdi+4] + mov r10d, DWORD PTR [rdi+8] + mov r11d, DWORD PTR [rdi+12] + mov r12d, DWORD PTR [rdi+16] + mov r13d, DWORD PTR [rdi+20] + mov r14d, DWORD PTR [rdi+24] + mov r15d, DWORD PTR [rdi+28] + ; Start of loop processing two blocks +L_sha256_len_avx2_rorx_start: + ; X0, X1, X2, X3 = W[0..15] + vmovdqu xmm0, OWORD PTR [rsi] + vmovdqu xmm1, OWORD PTR [rsi+16] + vinserti128 ymm0, ymm0, OWORD PTR [rsi+64], 1 + vinserti128 ymm1, ymm1, OWORD PTR [rsi+80], 1 + vpshufb ymm0, ymm0, ymm13 + vpshufb ymm1, ymm1, ymm13 + vpaddd ymm4, ymm0, YMMWORD PTR [rbp] + vpaddd ymm5, ymm1, YMMWORD PTR [rbp+32] + vmovdqu YMMWORD PTR [rsp], ymm4 + vmovdqu YMMWORD PTR [rsp+32], ymm5 + vmovdqu xmm2, OWORD PTR [rsi+32] + vmovdqu xmm3, OWORD PTR [rsi+48] + vinserti128 ymm2, ymm2, OWORD PTR [rsi+96], 1 + vinserti128 ymm3, ymm3, OWORD PTR [rsi+112], 1 + vpshufb ymm2, ymm2, ymm13 + vpshufb ymm3, ymm3, ymm13 + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+64] + vpaddd ymm5, ymm3, YMMWORD PTR [rbp+96] + vmovdqu YMMWORD PTR [rsp+64], ymm4 + vmovdqu YMMWORD PTR [rsp+96], ymm5 + mov ebx, r9d + rorx edx, r12d, 6 + xor ebx, r10d + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp] + vpalignr ymm5, ymm1, ymm0, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + vpalignr ymm4, ymm3, ymm2, 4 + ; rnd_0: 2 - 2 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r9d + add r11d, r15d + vpslld ymm9, ymm5, 
14 + ; rnd_0: 6 - 6 + xor eax, r8d + add r15d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+4] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpshufd ymm7, ymm3, 250 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r10d, r14d + mov ebx, r8d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + vpaddd ymm4, ymm4, ymm0 + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+8] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+12] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 
3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpaddd ymm0, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r8d, r12d + mov ebx, r14d + vpaddd ymm4, ymm0, YMMWORD PTR [rbp+128] + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vmovdqu YMMWORD PTR [rsp+128], ymm4 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+32] + vpalignr ymm5, ymm2, ymm1, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + vpalignr ymm4, ymm0, ymm3, 4 + ; rnd_0: 2 - 2 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r13d + add r15d, r11d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r12d + add r11d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+36] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpshufd ymm7, ymm0, 250 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r14d, r10d + mov ebx, r12d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + vpaddd ymm4, ymm4, ymm1 + ; rnd_0: 0 - 0 + mov eax, 
r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+40] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+44] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpaddd ymm1, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r12d, r8d + mov ebx, r10d + vpaddd ymm4, ymm1, YMMWORD PTR [rbp+160] + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vmovdqu YMMWORD PTR [rsp+160], ymm4 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+64] + vpalignr ymm5, ymm3, ymm2, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + vpalignr ymm4, ymm1, ymm0, 4 + ; rnd_0: 2 - 2 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor 
edx, ecx + mov eax, r9d + add r11d, r15d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r8d + add r15d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+68] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpshufd ymm7, ymm1, 250 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r10d, r14d + mov ebx, r8d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + vpaddd ymm4, ymm4, ymm2 + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+72] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+76] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r9d + xor 
edx, ecx + rorx ecx, r13d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpaddd ymm2, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r8d, r12d + mov ebx, r14d + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+192] + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vmovdqu YMMWORD PTR [rsp+192], ymm4 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+96] + vpalignr ymm5, ymm0, ymm3, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + vpalignr ymm4, ymm2, ymm1, 4 + ; rnd_0: 2 - 2 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r13d + add r15d, r11d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r12d + add r11d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+100] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpshufd ymm7, ymm2, 250 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r14d, r10d + mov ebx, r12d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + 
add r10d, eax + vpaddd ymm4, ymm4, ymm3 + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+104] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+108] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpaddd ymm3, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r12d, r8d + mov ebx, r10d + vpaddd ymm4, ymm3, YMMWORD PTR [rbp+224] + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vmovdqu YMMWORD PTR [rsp+224], ymm4 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+128] + vpalignr ymm5, ymm1, ymm0, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + vpalignr ymm4, ymm3, ymm2, 4 + ; rnd_0: 2 - 2 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, 
r8d, 22 + add r15d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r9d + add r11d, r15d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r8d + add r15d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+132] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpshufd ymm7, ymm3, 250 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r10d, r14d + mov ebx, r8d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + vpaddd ymm4, ymm4, ymm0 + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+136] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+140] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, 
r9d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpaddd ymm0, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r8d, r12d + mov ebx, r14d + vpaddd ymm4, ymm0, YMMWORD PTR [rbp+256] + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vmovdqu YMMWORD PTR [rsp+256], ymm4 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+160] + vpalignr ymm5, ymm2, ymm1, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + vpalignr ymm4, ymm0, ymm3, 4 + ; rnd_0: 2 - 2 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r13d + add r15d, r11d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r12d + add r11d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+164] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpshufd ymm7, ymm0, 250 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r14d, r10d + mov ebx, r12d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrlq 
ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + vpaddd ymm4, ymm4, ymm1 + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+168] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+172] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpaddd ymm1, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r12d, r8d + mov ebx, r10d + vpaddd ymm4, ymm1, YMMWORD PTR [rbp+288] + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vmovdqu YMMWORD PTR [rsp+288], ymm4 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+192] + vpalignr ymm5, ymm3, ymm2, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + vpalignr ymm4, ymm1, ymm0, 4 + ; rnd_0: 2 - 2 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d 
+ vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r9d + add r11d, r15d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r8d + add r15d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+196] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpshufd ymm7, ymm1, 250 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r10d, r14d + mov ebx, r8d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + vpaddd ymm4, ymm4, ymm2 + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+200] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+204] + vpsrld ymm9, 
ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpaddd ymm2, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r8d, r12d + mov ebx, r14d + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+320] + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vmovdqu YMMWORD PTR [rsp+320], ymm4 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+224] + vpalignr ymm5, ymm0, ymm3, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + vpalignr ymm4, ymm2, ymm1, 4 + ; rnd_0: 2 - 2 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r13d + add r15d, r11d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r12d + add r11d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+228] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpshufd ymm7, ymm2, 250 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r14d, r10d + mov ebx, r12d + vpsrlq ymm6, ymm7, 19 + ; 
rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + vpaddd ymm4, ymm4, ymm3 + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+232] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+236] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpaddd ymm3, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r12d, r8d + mov ebx, r10d + vpaddd ymm4, ymm3, YMMWORD PTR [rbp+352] + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vmovdqu YMMWORD PTR [rsp+352], ymm4 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+256] + vpalignr ymm5, ymm1, ymm0, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + vpalignr ymm4, ymm3, ymm2, 4 + ; rnd_0: 2 - 2 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld ymm6, ymm5, 
7 + ; rnd_0: 3 - 3 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r9d + add r11d, r15d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r8d + add r15d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+260] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpshufd ymm7, ymm3, 250 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r10d, r14d + mov ebx, r8d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + vpaddd ymm4, ymm4, ymm0 + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+264] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, r9d, 6 + add r13d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, 
r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+268] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpaddd ymm0, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r8d, r12d + mov ebx, r14d + vpaddd ymm4, ymm0, YMMWORD PTR [rbp+384] + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vmovdqu YMMWORD PTR [rsp+384], ymm4 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+288] + vpalignr ymm5, ymm2, ymm1, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + vpalignr ymm4, ymm0, ymm3, 4 + ; rnd_0: 2 - 2 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r13d + add r15d, r11d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r12d + add r11d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+292] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpshufd ymm7, ymm0, 250 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + 
xor edx, ecx + add r14d, r10d + mov ebx, r12d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + vpaddd ymm4, ymm4, ymm1 + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+296] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+300] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpaddd ymm1, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r12d, r8d + mov ebx, r10d + vpaddd ymm4, ymm1, YMMWORD PTR [rbp+416] + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vmovdqu YMMWORD PTR [rsp+416], ymm4 + ; rnd_0: 0 - 0 + mov eax, r13d + rorx ecx, r12d, 11 + add r15d, DWORD PTR [rsp+320] + vpalignr ymm5, ymm3, ymm2, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + vpalignr ymm4, ymm1, ymm0, 4 + ; rnd_0: 2 - 
2 + and eax, r12d + xor edx, ecx + rorx ecx, r8d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r15d, edx + rorx edx, r8d, 2 + xor eax, r14d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r9d + add r11d, r15d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r8d + add r15d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r9d + rorx edx, r11d, 6 + add r15d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r12d + rorx ecx, r11d, 11 + add r14d, DWORD PTR [rsp+324] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r11d + xor edx, ecx + rorx ecx, r15d, 13 + vpshufd ymm7, ymm1, 250 + ; rnd_1: 3 - 3 + add r14d, edx + rorx edx, r15d, 2 + xor ebx, r13d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r10d, r14d + mov ebx, r8d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r15d + add r14d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r8d + rorx edx, r10d, 6 + add r14d, eax + vpaddd ymm4, ymm4, ymm2 + ; rnd_0: 0 - 0 + mov eax, r11d + rorx ecx, r10d, 11 + add r13d, DWORD PTR [rsp+328] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r10d + xor edx, ecx + rorx ecx, r14d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r13d, edx + rorx edx, r14d, 2 + xor eax, r12d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r15d + add r9d, r13d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r14d + add r13d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r15d + rorx edx, 
r9d, 6 + add r13d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r10d + rorx ecx, r9d, 11 + add r12d, DWORD PTR [rsp+332] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r9d + xor edx, ecx + rorx ecx, r13d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r12d, edx + rorx edx, r13d, 2 + xor ebx, r11d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + vpaddd ymm2, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r8d, r12d + mov ebx, r14d + vpaddd ymm4, ymm2, YMMWORD PTR [rbp+448] + ; rnd_1: 6 - 6 + xor ebx, r13d + add r12d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r14d + rorx edx, r8d, 6 + add r12d, eax + vmovdqu YMMWORD PTR [rsp+448], ymm4 + ; rnd_0: 0 - 0 + mov eax, r9d + rorx ecx, r8d, 11 + add r11d, DWORD PTR [rsp+352] + vpalignr ymm5, ymm0, ymm3, 4 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + vpalignr ymm4, ymm2, ymm1, 4 + ; rnd_0: 2 - 2 + and eax, r8d + xor edx, ecx + rorx ecx, r12d, 13 + vpsrld ymm6, ymm5, 7 + ; rnd_0: 3 - 3 + add r11d, edx + rorx edx, r12d, 2 + xor eax, r10d + vpslld ymm7, ymm5, 25 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + vpsrld ymm8, ymm5, 18 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r13d + add r15d, r11d + vpslld ymm9, ymm5, 14 + ; rnd_0: 6 - 6 + xor eax, r12d + add r11d, edx + and ebx, eax + vpor ymm6, ymm6, ymm7 + ; rnd_0: 7 - 7 + xor ebx, r13d + rorx edx, r15d, 6 + add r11d, ebx + vpor ymm8, ymm8, ymm9 + ; rnd_1: 0 - 0 + mov ebx, r8d + rorx ecx, r15d, 11 + add r10d, DWORD PTR [rsp+356] + vpsrld ymm9, ymm5, 3 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + vpxor ymm6, ymm6, ymm8 + ; rnd_1: 2 - 2 + and ebx, r15d + xor edx, ecx + rorx ecx, r11d, 13 + vpshufd ymm7, ymm2, 250 + ; rnd_1: 3 - 3 + add r10d, edx + rorx edx, r11d, 2 + xor ebx, r9d + vpxor ymm5, ymm9, ymm6 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx 
edx, r11d, 22 + add r10d, ebx + vpsrld ymm8, ymm7, 10 + ; rnd_1: 5 - 5 + xor edx, ecx + add r14d, r10d + mov ebx, r12d + vpsrlq ymm6, ymm7, 19 + ; rnd_1: 6 - 6 + xor ebx, r11d + add r10d, edx + and eax, ebx + vpsrlq ymm7, ymm7, 17 + ; rnd_1: 7 - 7 + xor eax, r12d + rorx edx, r14d, 6 + add r10d, eax + vpaddd ymm4, ymm4, ymm3 + ; rnd_0: 0 - 0 + mov eax, r15d + rorx ecx, r14d, 11 + add r9d, DWORD PTR [rsp+360] + vpxor ymm6, ymm6, ymm7 + ; rnd_0: 1 - 1 + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + vpxor ymm8, ymm8, ymm6 + ; rnd_0: 2 - 2 + and eax, r14d + xor edx, ecx + rorx ecx, r10d, 13 + vpaddd ymm4, ymm4, ymm5 + ; rnd_0: 3 - 3 + add r9d, edx + rorx edx, r10d, 2 + xor eax, r8d + vpshufb ymm8, ymm8, ymm11 + ; rnd_0: 4 - 4 + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + vpaddd ymm4, ymm4, ymm8 + ; rnd_0: 5 - 5 + xor edx, ecx + mov eax, r11d + add r13d, r9d + vpshufd ymm6, ymm4, 80 + ; rnd_0: 6 - 6 + xor eax, r10d + add r9d, edx + and ebx, eax + vpsrlq ymm8, ymm6, 17 + ; rnd_0: 7 - 7 + xor ebx, r11d + rorx edx, r13d, 6 + add r9d, ebx + vpsrlq ymm7, ymm6, 19 + ; rnd_1: 0 - 0 + mov ebx, r14d + rorx ecx, r13d, 11 + add r8d, DWORD PTR [rsp+364] + vpsrld ymm9, ymm6, 10 + ; rnd_1: 1 - 1 + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + vpxor ymm8, ymm8, ymm7 + ; rnd_1: 2 - 2 + and ebx, r13d + xor edx, ecx + rorx ecx, r9d, 13 + vpxor ymm9, ymm9, ymm8 + ; rnd_1: 3 - 3 + add r8d, edx + rorx edx, r9d, 2 + xor ebx, r15d + vpshufb ymm9, ymm9, ymm12 + ; rnd_1: 4 - 4 + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + vpaddd ymm3, ymm9, ymm4 + ; rnd_1: 5 - 5 + xor edx, ecx + add r12d, r8d + mov ebx, r10d + vpaddd ymm4, ymm3, YMMWORD PTR [rbp+480] + ; rnd_1: 6 - 6 + xor ebx, r9d + add r8d, edx + and eax, ebx + ; rnd_1: 7 - 7 + xor eax, r10d + rorx edx, r12d, 6 + add r8d, eax + vmovdqu YMMWORD PTR [rsp+480], ymm4 + xor eax, eax + xor ecx, ecx + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + lea r8d, DWORD PTR [r8+rax] + add r15d, DWORD PTR [rsp+384] + mov eax, r13d + xor ecx, 
edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+388] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + lea r10d, DWORD PTR [r10+r14] + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + lea r14d, DWORD PTR [r14+rax] + add r13d, DWORD PTR [rsp+392] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+396] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + lea r8d, DWORD PTR [r8+r12] + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + lea r12d, DWORD PTR [r12+rax] + add r11d, DWORD PTR [rsp+416] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d 
+ and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+420] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + lea r14d, DWORD PTR [r14+r10] + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + lea r10d, DWORD PTR [r10+rax] + add r9d, DWORD PTR [rsp+424] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+428] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + lea r12d, DWORD PTR [r12+r8] + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + lea r8d, DWORD PTR [r8+rax] + add r15d, DWORD PTR [rsp+448] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+452] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 
2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + lea r10d, DWORD PTR [r10+r14] + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + lea r14d, DWORD PTR [r14+rax] + add r13d, DWORD PTR [rsp+456] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+460] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + lea r8d, DWORD PTR [r8+r12] + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + lea r12d, DWORD PTR [r12+rax] + add r11d, DWORD PTR [rsp+480] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+484] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + lea r14d, DWORD PTR [r14+r10] + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 
+ lea r10d, DWORD PTR [r10+rax] + add r9d, DWORD PTR [rsp+488] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+492] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + lea r12d, DWORD PTR [r12+r8] + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + add r8d, eax + xor ecx, ecx + add r8d, DWORD PTR [rdi] + add r9d, DWORD PTR [rdi+4] + add r10d, DWORD PTR [rdi+8] + add r11d, DWORD PTR [rdi+12] + add r12d, DWORD PTR [rdi+16] + add r13d, DWORD PTR [rdi+20] + add r14d, DWORD PTR [rdi+24] + add r15d, DWORD PTR [rdi+28] + mov DWORD PTR [rdi], r8d + mov DWORD PTR [rdi+4], r9d + mov DWORD PTR [rdi+8], r10d + mov DWORD PTR [rdi+12], r11d + mov DWORD PTR [rdi+16], r12d + mov DWORD PTR [rdi+20], r13d + mov DWORD PTR [rdi+24], r14d + mov DWORD PTR [rdi+28], r15d + mov ebx, r9d + xor eax, eax + xor ebx, r10d + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + lea r8d, DWORD PTR [r8+rax] + add r15d, DWORD PTR [rsp+16] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+20] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + 
add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + lea r10d, DWORD PTR [r10+r14] + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + lea r14d, DWORD PTR [r14+rax] + add r13d, DWORD PTR [rsp+24] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+28] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + lea r8d, DWORD PTR [r8+r12] + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + lea r12d, DWORD PTR [r12+rax] + add r11d, DWORD PTR [rsp+48] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+52] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + lea r14d, DWORD PTR [r14+r10] + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, 
r14d, 6 + rorx ecx, r14d, 11 + lea r10d, DWORD PTR [r10+rax] + add r9d, DWORD PTR [rsp+56] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+60] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + lea r12d, DWORD PTR [r12+r8] + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + lea r8d, DWORD PTR [r8+rax] + add r15d, DWORD PTR [rsp+80] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+84] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + lea r10d, DWORD PTR [r10+r14] + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + lea r14d, DWORD PTR [r14+rax] + add r13d, DWORD PTR [rsp+88] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor 
ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+92] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + lea r8d, DWORD PTR [r8+r12] + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + lea r12d, DWORD PTR [r12+rax] + add r11d, DWORD PTR [rsp+112] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+116] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + lea r14d, DWORD PTR [r14+r10] + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + lea r10d, DWORD PTR [r10+rax] + add r9d, DWORD PTR [rsp+120] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+124] + mov ebx, r14d + 
xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + lea r12d, DWORD PTR [r12+r8] + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + lea r8d, DWORD PTR [r8+rax] + add r15d, DWORD PTR [rsp+144] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+148] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + lea r10d, DWORD PTR [r10+r14] + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + lea r14d, DWORD PTR [r14+rax] + add r13d, DWORD PTR [rsp+152] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+156] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + lea r8d, DWORD PTR 
[r8+r12] + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + lea r12d, DWORD PTR [r12+rax] + add r11d, DWORD PTR [rsp+176] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+180] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + lea r14d, DWORD PTR [r14+r10] + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + lea r10d, DWORD PTR [r10+rax] + add r9d, DWORD PTR [rsp+184] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+188] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + lea r12d, DWORD PTR [r12+r8] + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + lea r8d, DWORD PTR [r8+rax] + add r15d, DWORD PTR [rsp+208] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and 
eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+212] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + lea r10d, DWORD PTR [r10+r14] + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + lea r14d, DWORD PTR [r14+rax] + add r13d, DWORD PTR [rsp+216] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+220] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + lea r8d, DWORD PTR [r8+r12] + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + lea r12d, DWORD PTR [r12+rax] + add r11d, DWORD PTR [rsp+240] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, 
r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+244] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + lea r14d, DWORD PTR [r14+r10] + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + lea r10d, DWORD PTR [r10+rax] + add r9d, DWORD PTR [rsp+248] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+252] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + lea r12d, DWORD PTR [r12+r8] + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + lea r8d, DWORD PTR [r8+rax] + add r15d, DWORD PTR [rsp+272] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+276] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + 
rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + lea r10d, DWORD PTR [r10+r14] + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + lea r14d, DWORD PTR [r14+rax] + add r13d, DWORD PTR [rsp+280] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+284] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + lea r8d, DWORD PTR [r8+r12] + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + lea r12d, DWORD PTR [r12+rax] + add r11d, DWORD PTR [rsp+304] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+308] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + lea r14d, DWORD PTR [r14+r10] + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + lea r10d, DWORD PTR [r10+rax] + add r9d, DWORD PTR 
[rsp+312] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+316] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + lea r12d, DWORD PTR [r12+r8] + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + lea r8d, DWORD PTR [r8+rax] + add r15d, DWORD PTR [rsp+336] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+340] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + lea r10d, DWORD PTR [r10+r14] + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + lea r14d, DWORD PTR [r14+rax] + add r13d, DWORD PTR [rsp+344] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + 
add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+348] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + lea r8d, DWORD PTR [r8+r12] + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + lea r12d, DWORD PTR [r12+rax] + add r11d, DWORD PTR [rsp+368] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+372] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + lea r14d, DWORD PTR [r14+r10] + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + lea r10d, DWORD PTR [r10+rax] + add r9d, DWORD PTR [rsp+376] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+380] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d 
+ add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + lea r12d, DWORD PTR [r12+r8] + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + lea r8d, DWORD PTR [r8+rax] + add r15d, DWORD PTR [rsp+400] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+404] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + lea r10d, DWORD PTR [r10+r14] + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + lea r14d, DWORD PTR [r14+rax] + add r13d, DWORD PTR [rsp+408] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+412] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + lea r8d, DWORD PTR [r8+r12] + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + rorx edx, 
r8d, 6 + rorx ecx, r8d, 11 + lea r12d, DWORD PTR [r12+rax] + add r11d, DWORD PTR [rsp+432] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+436] + mov ebx, r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + lea r14d, DWORD PTR [r14+r10] + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + lea r10d, DWORD PTR [r10+rax] + add r9d, DWORD PTR [rsp+440] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+444] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + lea r12d, DWORD PTR [r12+r8] + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + rorx edx, r12d, 6 + rorx ecx, r12d, 11 + lea r8d, DWORD PTR [r8+rax] + add r15d, DWORD PTR [rsp+464] + mov eax, r13d + xor ecx, edx + xor eax, r14d + rorx edx, r12d, 25 + xor edx, ecx + and eax, r12d + add r15d, edx + rorx edx, r8d, 2 + rorx ecx, r8d, 13 + xor eax, r14d + xor 
ecx, edx + rorx edx, r8d, 22 + add r15d, eax + xor edx, ecx + mov eax, r9d + add r11d, r15d + xor eax, r8d + and ebx, eax + add r15d, edx + xor ebx, r9d + rorx edx, r11d, 6 + rorx ecx, r11d, 11 + add r15d, ebx + add r14d, DWORD PTR [rsp+468] + mov ebx, r12d + xor ecx, edx + xor ebx, r13d + rorx edx, r11d, 25 + xor edx, ecx + and ebx, r11d + add r14d, edx + rorx edx, r15d, 2 + rorx ecx, r15d, 13 + xor ebx, r13d + xor ecx, edx + rorx edx, r15d, 22 + add r14d, ebx + xor edx, ecx + mov ebx, r8d + lea r10d, DWORD PTR [r10+r14] + xor ebx, r15d + and eax, ebx + add r14d, edx + xor eax, r8d + rorx edx, r10d, 6 + rorx ecx, r10d, 11 + lea r14d, DWORD PTR [r14+rax] + add r13d, DWORD PTR [rsp+472] + mov eax, r11d + xor ecx, edx + xor eax, r12d + rorx edx, r10d, 25 + xor edx, ecx + and eax, r10d + add r13d, edx + rorx edx, r14d, 2 + rorx ecx, r14d, 13 + xor eax, r12d + xor ecx, edx + rorx edx, r14d, 22 + add r13d, eax + xor edx, ecx + mov eax, r15d + add r9d, r13d + xor eax, r14d + and ebx, eax + add r13d, edx + xor ebx, r15d + rorx edx, r9d, 6 + rorx ecx, r9d, 11 + add r13d, ebx + add r12d, DWORD PTR [rsp+476] + mov ebx, r10d + xor ecx, edx + xor ebx, r11d + rorx edx, r9d, 25 + xor edx, ecx + and ebx, r9d + add r12d, edx + rorx edx, r13d, 2 + rorx ecx, r13d, 13 + xor ebx, r11d + xor ecx, edx + rorx edx, r13d, 22 + add r12d, ebx + xor edx, ecx + mov ebx, r14d + lea r8d, DWORD PTR [r8+r12] + xor ebx, r13d + and eax, ebx + add r12d, edx + xor eax, r14d + rorx edx, r8d, 6 + rorx ecx, r8d, 11 + lea r12d, DWORD PTR [r12+rax] + add r11d, DWORD PTR [rsp+496] + mov eax, r9d + xor ecx, edx + xor eax, r10d + rorx edx, r8d, 25 + xor edx, ecx + and eax, r8d + add r11d, edx + rorx edx, r12d, 2 + rorx ecx, r12d, 13 + xor eax, r10d + xor ecx, edx + rorx edx, r12d, 22 + add r11d, eax + xor edx, ecx + mov eax, r13d + add r15d, r11d + xor eax, r12d + and ebx, eax + add r11d, edx + xor ebx, r13d + rorx edx, r15d, 6 + rorx ecx, r15d, 11 + add r11d, ebx + add r10d, DWORD PTR [rsp+500] + mov ebx, 
r8d + xor ecx, edx + xor ebx, r9d + rorx edx, r15d, 25 + xor edx, ecx + and ebx, r15d + add r10d, edx + rorx edx, r11d, 2 + rorx ecx, r11d, 13 + xor ebx, r9d + xor ecx, edx + rorx edx, r11d, 22 + add r10d, ebx + xor edx, ecx + mov ebx, r12d + lea r14d, DWORD PTR [r14+r10] + xor ebx, r11d + and eax, ebx + add r10d, edx + xor eax, r12d + rorx edx, r14d, 6 + rorx ecx, r14d, 11 + lea r10d, DWORD PTR [r10+rax] + add r9d, DWORD PTR [rsp+504] + mov eax, r15d + xor ecx, edx + xor eax, r8d + rorx edx, r14d, 25 + xor edx, ecx + and eax, r14d + add r9d, edx + rorx edx, r10d, 2 + rorx ecx, r10d, 13 + xor eax, r8d + xor ecx, edx + rorx edx, r10d, 22 + add r9d, eax + xor edx, ecx + mov eax, r11d + add r13d, r9d + xor eax, r10d + and ebx, eax + add r9d, edx + xor ebx, r11d + rorx edx, r13d, 6 + rorx ecx, r13d, 11 + add r9d, ebx + add r8d, DWORD PTR [rsp+508] + mov ebx, r14d + xor ecx, edx + xor ebx, r15d + rorx edx, r13d, 25 + xor edx, ecx + and ebx, r13d + add r8d, edx + rorx edx, r9d, 2 + rorx ecx, r9d, 13 + xor ebx, r15d + xor ecx, edx + rorx edx, r9d, 22 + add r8d, ebx + xor edx, ecx + mov ebx, r10d + lea r12d, DWORD PTR [r12+r8] + xor ebx, r9d + and eax, ebx + add r8d, edx + xor eax, r10d + add r8d, eax + add rsi, 128 + add r8d, DWORD PTR [rdi] + add r9d, DWORD PTR [rdi+4] + add r10d, DWORD PTR [rdi+8] + add r11d, DWORD PTR [rdi+12] + add r12d, DWORD PTR [rdi+16] + add r13d, DWORD PTR [rdi+20] + add r14d, DWORD PTR [rdi+24] + add r15d, DWORD PTR [rdi+28] + sub DWORD PTR [rsp+512], 128 + mov DWORD PTR [rdi], r8d + mov DWORD PTR [rdi+4], r9d + mov DWORD PTR [rdi+8], r10d + mov DWORD PTR [rdi+12], r11d + mov DWORD PTR [rdi+16], r12d + mov DWORD PTR [rdi+20], r13d + mov DWORD PTR [rdi+24], r14d + mov DWORD PTR [rdi+28], r15d + jnz L_sha256_len_avx2_rorx_start +L_sha256_len_avx2_rorx_done: + xor rax, rax + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+512] + vmovdqu xmm7, OWORD PTR [rsp+528] + vmovdqu xmm8, OWORD PTR [rsp+544] + vmovdqu xmm9, OWORD PTR [rsp+560] + vmovdqu xmm10, 
OWORD PTR [rsp+576] + vmovdqu xmm11, OWORD PTR [rsp+592] + vmovdqu xmm12, OWORD PTR [rsp+608] + vmovdqu xmm13, OWORD PTR [rsp+624] + add rsp, 644 + pop rbp + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha256_AVX2_RORX_Len ENDP +_TEXT ENDS +ENDIF +ENDIF +END diff --git a/wolfcrypt/src/sha3_asm.asm b/wolfcrypt/src/sha3_asm.asm new file mode 100644 index 00000000000..8f4db30ff57 --- /dev/null +++ b/wolfcrypt/src/sha3_asm.asm @@ -0,0 +1,31448 @@ +; /* sha3_asm.asm */ +; /* +; * Copyright (C) 2006-2026 wolfSSL Inc. +; * +; * This file is part of wolfSSL. +; * +; * wolfSSL is free software; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation; either version 3 of the License, or +; * (at your option) any later version. +; * +; * wolfSSL is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. 
+; * +; * You should have received a copy of the GNU General Public License +; * along with this program; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA +; */ + +IF @Version LT 1200 +; AVX2 instructions not recognized by old versions of MASM +IFNDEF NO_AVX2_SUPPORT +NO_AVX2_SUPPORT = 1 +ENDIF +; MOVBE instruction not recognized by old versions of MASM +IFNDEF NO_MOVBE_SUPPORT +NO_MOVBE_SUPPORT = 1 +ENDIF +ENDIF + +IFNDEF HAVE_INTEL_AVX1 +HAVE_INTEL_AVX1 = 1 +ENDIF +IFNDEF NO_AVX2_SUPPORT +HAVE_INTEL_AVX2 = 1 +ENDIF + +IFNDEF _WIN64 +_WIN64 = 1 +ENDIF + +_DATA SEGMENT +ALIGN 16 +L_sha3_avx2_r QWORD 0000000000000001h, 0000000000000001h + QWORD 0000000000000001h, 0000000000000001h + QWORD 0000000000008082h, 0000000000008082h + QWORD 0000000000008082h, 0000000000008082h + QWORD 800000000000808ah, 800000000000808ah + QWORD 800000000000808ah, 800000000000808ah + QWORD 8000000080008000h, 8000000080008000h + QWORD 8000000080008000h, 8000000080008000h + QWORD 000000000000808bh, 000000000000808bh + QWORD 000000000000808bh, 000000000000808bh + QWORD 0000000080000001h, 0000000080000001h + QWORD 0000000080000001h, 0000000080000001h + QWORD 8000000080008081h, 8000000080008081h + QWORD 8000000080008081h, 8000000080008081h + QWORD 8000000000008009h, 8000000000008009h + QWORD 8000000000008009h, 8000000000008009h + QWORD 000000000000008ah, 000000000000008ah + QWORD 000000000000008ah, 000000000000008ah + QWORD 0000000000000088h, 0000000000000088h + QWORD 0000000000000088h, 0000000000000088h + QWORD 0000000080008009h, 0000000080008009h + QWORD 0000000080008009h, 0000000080008009h + QWORD 000000008000000ah, 000000008000000ah + QWORD 000000008000000ah, 000000008000000ah + QWORD 000000008000808bh, 000000008000808bh + QWORD 000000008000808bh, 000000008000808bh + QWORD 800000000000008bh, 800000000000008bh + QWORD 800000000000008bh, 800000000000008bh + QWORD 8000000000008089h, 8000000000008089h + QWORD 8000000000008089h, 
8000000000008089h + QWORD 8000000000008003h, 8000000000008003h + QWORD 8000000000008003h, 8000000000008003h + QWORD 8000000000008002h, 8000000000008002h + QWORD 8000000000008002h, 8000000000008002h + QWORD 8000000000000080h, 8000000000000080h + QWORD 8000000000000080h, 8000000000000080h + QWORD 000000000000800ah, 000000000000800ah + QWORD 000000000000800ah, 000000000000800ah + QWORD 800000008000000ah, 800000008000000ah + QWORD 800000008000000ah, 800000008000000ah + QWORD 8000000080008081h, 8000000080008081h + QWORD 8000000080008081h, 8000000080008081h + QWORD 8000000000008080h, 8000000000008080h + QWORD 8000000000008080h, 8000000000008080h + QWORD 0000000080000001h, 0000000080000001h + QWORD 0000000080000001h, 0000000080000001h + QWORD 8000000080008008h, 8000000080008008h + QWORD 8000000080008008h, 8000000080008008h +ptr_L_sha3_avx2_r QWORD L_sha3_avx2_r +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_sha3_x4_avx2_r QWORD 0000000000000001h, 0000000000000001h + QWORD 0000000000000001h, 0000000000000001h + QWORD 0000000000008082h, 0000000000008082h + QWORD 0000000000008082h, 0000000000008082h + QWORD 800000000000808ah, 800000000000808ah + QWORD 800000000000808ah, 800000000000808ah + QWORD 8000000080008000h, 8000000080008000h + QWORD 8000000080008000h, 8000000080008000h + QWORD 000000000000808bh, 000000000000808bh + QWORD 000000000000808bh, 000000000000808bh + QWORD 0000000080000001h, 0000000080000001h + QWORD 0000000080000001h, 0000000080000001h + QWORD 8000000080008081h, 8000000080008081h + QWORD 8000000080008081h, 8000000080008081h + QWORD 8000000000008009h, 8000000000008009h + QWORD 8000000000008009h, 8000000000008009h + QWORD 000000000000008ah, 000000000000008ah + QWORD 000000000000008ah, 000000000000008ah + QWORD 0000000000000088h, 0000000000000088h + QWORD 0000000000000088h, 0000000000000088h + QWORD 0000000080008009h, 0000000080008009h + QWORD 0000000080008009h, 0000000080008009h + QWORD 000000008000000ah, 000000008000000ah + QWORD 000000008000000ah, 000000008000000ah 
+ QWORD 000000008000808bh, 000000008000808bh + QWORD 000000008000808bh, 000000008000808bh + QWORD 800000000000008bh, 800000000000008bh + QWORD 800000000000008bh, 800000000000008bh + QWORD 8000000000008089h, 8000000000008089h + QWORD 8000000000008089h, 8000000000008089h + QWORD 8000000000008003h, 8000000000008003h + QWORD 8000000000008003h, 8000000000008003h + QWORD 8000000000008002h, 8000000000008002h + QWORD 8000000000008002h, 8000000000008002h + QWORD 8000000000000080h, 8000000000000080h + QWORD 8000000000000080h, 8000000000000080h + QWORD 000000000000800ah, 000000000000800ah + QWORD 000000000000800ah, 000000000000800ah + QWORD 800000008000000ah, 800000008000000ah + QWORD 800000008000000ah, 800000008000000ah + QWORD 8000000080008081h, 8000000080008081h + QWORD 8000000080008081h, 8000000080008081h + QWORD 8000000000008080h, 8000000000008080h + QWORD 8000000000008080h, 8000000000008080h + QWORD 0000000080000001h, 0000000080000001h + QWORD 0000000080000001h, 0000000080000001h + QWORD 8000000080008008h, 8000000080008008h + QWORD 8000000080008008h, 8000000080008008h +ptr_L_sha3_x4_avx2_r QWORD L_sha3_x4_avx2_r +_DATA ENDS +IFDEF HAVE_INTEL_AVX2 +_TEXT SEGMENT READONLY PARA +sha3_block_bmi2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + mov rsi, QWORD PTR [rcx] + add rcx, 96 + ; Round 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+-88] + mov r13, QWORD PTR [rcx+-80] + mov r14, QWORD PTR [rcx+-72] + mov r15, QWORD PTR [rcx+-64] + xor r11, QWORD PTR [rcx+-56] + xor r12, QWORD PTR [rcx+-48] + xor r13, QWORD PTR [rcx+-40] + xor r14, QWORD PTR [rcx+-32] + xor r15, QWORD PTR [rcx+-24] + xor r11, QWORD PTR [rcx+-16] + xor r12, QWORD PTR [rcx+-8] + xor r13, QWORD PTR [rcx] + xor r14, QWORD PTR [rcx+8] + xor r15, QWORD PTR [rcx+16] + xor r11, QWORD PTR [rcx+24] + xor r12, QWORD PTR [rcx+32] + xor r13, QWORD PTR [rcx+40] + xor r14, QWORD PTR [rcx+48] + xor r15, QWORD PTR [rcx+56] + xor r11, QWORD PTR [rcx+64] + xor r12, QWORD PTR [rcx+72] + xor r13, QWORD 
PTR [rcx+80] + xor r14, QWORD PTR [rcx+88] + xor r15, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+-48] + mov r13, QWORD PTR [rcx] + mov r14, QWORD PTR [rcx+48] + mov r15, QWORD PTR [rcx+96] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-48], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+48], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+96], r14 + ; XOR in constant + xor rsi, 1 + ; Row 1 + mov r11, QWORD PTR [rcx+-72] + mov r12, QWORD PTR [rcx+-24] + mov r13, QWORD PTR [rcx+-16] + mov r14, QWORD PTR [rcx+32] + mov r15, QWORD PTR [rcx+80] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-24], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-16], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+32], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+80], rdi + mov QWORD PTR [rcx+-72], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+-88] + mov r12, QWORD PTR [rcx+-40] + mov r13, QWORD PTR [rcx+8] + mov r14, QWORD PTR [rcx+56] + mov r15, QWORD PTR [rcx+64] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-40], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+8], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR 
[rcx+56], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+64], rdi + mov QWORD PTR [rcx+-88], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+-64] + mov r12, QWORD PTR [rcx+-56] + mov r13, QWORD PTR [rcx+-8] + mov r14, QWORD PTR [rcx+40] + mov r15, QWORD PTR [rcx+88] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-56], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-8], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+40], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+88], rdi + mov QWORD PTR [rcx+-64], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+-80] + xor r9, QWORD PTR [rcx+-32] + xor r10, QWORD PTR [rcx+16] + xor rdx, QWORD PTR [rcx+24] + xor rax, QWORD PTR [rcx+72] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+-80], r11 + mov QWORD PTR [rcx+-32], r12 + mov QWORD PTR [rcx+16], r13 + mov QWORD PTR [rcx+24], r14 + mov QWORD PTR [rcx+72], r15 + ; Round 1 + xor r11, rsi + xor r11, QWORD PTR [rcx+-88] + xor r11, QWORD PTR [rcx+-72] + xor r11, QWORD PTR [rcx+-64] + xor r12, QWORD PTR [rcx+-56] + xor r12, QWORD PTR [rcx+-48] + xor r12, QWORD PTR [rcx+-40] + xor r12, QWORD PTR [rcx+-24] + xor r13, QWORD PTR [rcx+-16] + xor r13, QWORD PTR [rcx+-8] + xor r13, QWORD PTR [rcx] + xor r13, QWORD PTR [rcx+8] + xor r14, QWORD PTR [rcx+32] + xor r14, QWORD PTR [rcx+40] + xor r14, QWORD PTR [rcx+48] + xor r14, QWORD PTR [rcx+56] + xor r15, QWORD PTR [rcx+64] + xor r15, QWORD PTR [rcx+80] + xor r15, QWORD PTR [rcx+88] + xor r15, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, 
r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+-24] + mov r13, QWORD PTR [rcx+8] + mov r14, QWORD PTR [rcx+40] + mov r15, QWORD PTR [rcx+72] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-24], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+8], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+40], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+72], r14 + ; XOR in constant + xor rsi, 32898 + ; Row 1 + mov r11, QWORD PTR [rcx+48] + mov r12, QWORD PTR [rcx+80] + mov r13, QWORD PTR [rcx+-88] + mov r14, QWORD PTR [rcx+-56] + mov r15, QWORD PTR [rcx+16] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+80], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-88], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-56], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+16], rdi + mov QWORD PTR [rcx+48], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+-48] + mov r12, QWORD PTR [rcx+-16] + mov r13, QWORD PTR [rcx+56] + mov r14, QWORD PTR [rcx+88] + mov r15, QWORD PTR [rcx+-80] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-16], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+56], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+88], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov 
QWORD PTR [rcx+-80], rdi + mov QWORD PTR [rcx+-48], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+96] + mov r12, QWORD PTR [rcx+-72] + mov r13, QWORD PTR [rcx+-40] + mov r14, QWORD PTR [rcx+-8] + mov r15, QWORD PTR [rcx+24] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-72], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-40], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-8], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+24], rdi + mov QWORD PTR [rcx+96], r14 + ; Row 4 + xor r8, QWORD PTR [rcx] + xor r9, QWORD PTR [rcx+32] + xor r10, QWORD PTR [rcx+64] + xor rdx, QWORD PTR [rcx+-64] + xor rax, QWORD PTR [rcx+-32] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+32], r12 + mov QWORD PTR [rcx+64], r13 + mov QWORD PTR [rcx+-64], r14 + mov QWORD PTR [rcx+-32], r15 + ; Round 2 + xor r11, rsi + xor r13, QWORD PTR [rcx+-88] + xor r15, QWORD PTR [rcx+-80] + xor r12, QWORD PTR [rcx+-72] + xor r14, QWORD PTR [rcx+-56] + xor r11, QWORD PTR [rcx+-48] + xor r13, QWORD PTR [rcx+-40] + xor r12, QWORD PTR [rcx+-24] + xor r12, QWORD PTR [rcx+-16] + xor r14, QWORD PTR [rcx+-8] + xor r13, QWORD PTR [rcx+8] + xor r15, QWORD PTR [rcx+16] + xor r15, QWORD PTR [rcx+24] + xor r14, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+48] + xor r13, QWORD PTR [rcx+56] + xor r15, QWORD PTR [rcx+72] + xor r12, QWORD PTR [rcx+80] + xor r14, QWORD PTR [rcx+88] + xor r11, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor 
rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+80] + mov r13, QWORD PTR [rcx+56] + mov r14, QWORD PTR [rcx+-8] + mov r15, QWORD PTR [rcx+-32] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+80], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+56], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-8], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+-32], r14 + ; XOR in constant + mov r15, 9223372036854808714 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+40] + mov r12, QWORD PTR [rcx+16] + mov r13, QWORD PTR [rcx+-48] + mov r14, QWORD PTR [rcx+-72] + mov r15, QWORD PTR [rcx+64] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+16], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-48], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-72], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+64], rdi + mov QWORD PTR [rcx+40], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+-24] + mov r12, QWORD PTR [rcx+-88] + mov r13, QWORD PTR [rcx+88] + mov r14, QWORD PTR [rcx+24] + mov r15, QWORD PTR [rcx] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-88], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+88], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+24], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+-24], r14 + ; Row 3 + 
mov r11, QWORD PTR [rcx+72] + mov r12, QWORD PTR [rcx+48] + mov r13, QWORD PTR [rcx+-16] + mov r14, QWORD PTR [rcx+-40] + mov r15, QWORD PTR [rcx+-64] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+48], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-16], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-40], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-64], rdi + mov QWORD PTR [rcx+72], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+8] + xor r9, QWORD PTR [rcx+-56] + xor r10, QWORD PTR [rcx+-80] + xor rdx, QWORD PTR [rcx+96] + xor rax, QWORD PTR [rcx+32] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+8], r11 + mov QWORD PTR [rcx+-56], r12 + mov QWORD PTR [rcx+-80], r13 + mov QWORD PTR [rcx+96], r14 + mov QWORD PTR [rcx+32], r15 + ; Round 3 + xor r11, rsi + xor r12, QWORD PTR [rcx+-88] + xor r14, QWORD PTR [rcx+-72] + xor r15, QWORD PTR [rcx+-64] + xor r13, QWORD PTR [rcx+-48] + xor r14, QWORD PTR [rcx+-40] + xor r15, QWORD PTR [rcx+-32] + xor r11, QWORD PTR [rcx+-24] + xor r13, QWORD PTR [rcx+-16] + xor r14, QWORD PTR [rcx+-8] + xor r15, QWORD PTR [rcx] + xor r12, QWORD PTR [rcx+16] + xor r14, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+40] + xor r12, QWORD PTR [rcx+48] + xor r13, QWORD PTR [rcx+56] + xor r15, QWORD PTR [rcx+64] + xor r11, QWORD PTR [rcx+72] + xor r12, QWORD PTR [rcx+80] + xor r13, QWORD PTR [rcx+88] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 
+ ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+16] + mov r13, QWORD PTR [rcx+88] + mov r14, QWORD PTR [rcx+-40] + mov r15, QWORD PTR [rcx+32] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+16], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+88], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-40], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+32], r14 + ; XOR in constant + mov r15, 9223372039002292224 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+-8] + mov r12, QWORD PTR [rcx+64] + mov r13, QWORD PTR [rcx+-24] + mov r14, QWORD PTR [rcx+48] + mov r15, QWORD PTR [rcx+-80] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+64], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-24], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+48], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-80], rdi + mov QWORD PTR [rcx+-8], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+80] + mov r12, QWORD PTR [rcx+-48] + mov r13, QWORD PTR [rcx+24] + mov r14, QWORD PTR [rcx+-64] + mov r15, QWORD PTR [rcx+8] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-48], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+24], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-64], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+8], rdi + mov QWORD PTR [rcx+80], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+-32] + mov r12, QWORD PTR [rcx+40] + mov 
r13, QWORD PTR [rcx+-88] + mov r14, QWORD PTR [rcx+-16] + mov r15, QWORD PTR [rcx+96] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+40], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-88], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-16], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+96], rdi + mov QWORD PTR [rcx+-32], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+56] + xor r9, QWORD PTR [rcx+-72] + xor r10, QWORD PTR [rcx] + xor rdx, QWORD PTR [rcx+72] + xor rax, QWORD PTR [rcx+-56] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+56], r11 + mov QWORD PTR [rcx+-72], r12 + mov QWORD PTR [rcx], r13 + mov QWORD PTR [rcx+72], r14 + mov QWORD PTR [rcx+-56], r15 + ; Round 4 + xor r11, rsi + xor r13, QWORD PTR [rcx+-88] + xor r15, QWORD PTR [rcx+-80] + xor r14, QWORD PTR [rcx+-64] + xor r12, QWORD PTR [rcx+-48] + xor r14, QWORD PTR [rcx+-40] + xor r11, QWORD PTR [rcx+-32] + xor r13, QWORD PTR [rcx+-24] + xor r14, QWORD PTR [rcx+-16] + xor r11, QWORD PTR [rcx+-8] + xor r15, QWORD PTR [rcx+8] + xor r12, QWORD PTR [rcx+16] + xor r13, QWORD PTR [rcx+24] + xor r15, QWORD PTR [rcx+32] + xor r12, QWORD PTR [rcx+40] + xor r14, QWORD PTR [rcx+48] + xor r12, QWORD PTR [rcx+64] + xor r11, QWORD PTR [rcx+80] + xor r13, QWORD PTR [rcx+88] + xor r15, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+64] 
+ mov r13, QWORD PTR [rcx+24] + mov r14, QWORD PTR [rcx+-16] + mov r15, QWORD PTR [rcx+-56] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+64], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+24], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-16], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+-56], r14 + ; XOR in constant + xor rsi, 32907 + ; Row 1 + mov r11, QWORD PTR [rcx+-40] + mov r12, QWORD PTR [rcx+-80] + mov r13, QWORD PTR [rcx+80] + mov r14, QWORD PTR [rcx+40] + mov r15, QWORD PTR [rcx] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-80], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+80], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+40], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+-40], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+16] + mov r12, QWORD PTR [rcx+-24] + mov r13, QWORD PTR [rcx+-64] + mov r14, QWORD PTR [rcx+96] + mov r15, QWORD PTR [rcx+56] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-24], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-64], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+96], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+56], rdi + mov QWORD PTR [rcx+16], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+32] + mov r12, QWORD PTR [rcx+-8] + mov r13, QWORD PTR [rcx+-48] + mov r14, QWORD PTR [rcx+-88] + mov r15, QWORD PTR [rcx+72] + xor r11, 
r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-8], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-48], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-88], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+72], rdi + mov QWORD PTR [rcx+32], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+88] + xor r9, QWORD PTR [rcx+48] + xor r10, QWORD PTR [rcx+8] + xor rdx, QWORD PTR [rcx+-32] + xor rax, QWORD PTR [rcx+-72] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+88], r11 + mov QWORD PTR [rcx+48], r12 + mov QWORD PTR [rcx+8], r13 + mov QWORD PTR [rcx+-32], r14 + mov QWORD PTR [rcx+-72], r15 + ; Round 5 + xor r11, rsi + xor r14, QWORD PTR [rcx+-88] + xor r12, QWORD PTR [rcx+-80] + xor r13, QWORD PTR [rcx+-64] + xor r15, QWORD PTR [rcx+-56] + xor r13, QWORD PTR [rcx+-48] + xor r11, QWORD PTR [rcx+-40] + xor r12, QWORD PTR [rcx+-24] + xor r14, QWORD PTR [rcx+-16] + xor r12, QWORD PTR [rcx+-8] + xor r15, QWORD PTR [rcx] + xor r11, QWORD PTR [rcx+16] + xor r13, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+32] + xor r14, QWORD PTR [rcx+40] + xor r15, QWORD PTR [rcx+56] + xor r12, QWORD PTR [rcx+64] + xor r15, QWORD PTR [rcx+72] + xor r13, QWORD PTR [rcx+80] + xor r14, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+-80] + mov r13, QWORD PTR [rcx+-64] + mov r14, QWORD PTR [rcx+-88] + mov r15, QWORD PTR [rcx+-72] + 
xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-80], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-64], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-88], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+-72], r14 + ; XOR in constant + mov r15, 2147483649 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+-16] + mov r12, QWORD PTR [rcx] + mov r13, QWORD PTR [rcx+16] + mov r14, QWORD PTR [rcx+-8] + mov r15, QWORD PTR [rcx+8] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+16], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-8], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+8], rdi + mov QWORD PTR [rcx+-16], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+64] + mov r12, QWORD PTR [rcx+80] + mov r13, QWORD PTR [rcx+96] + mov r14, QWORD PTR [rcx+72] + mov r15, QWORD PTR [rcx+88] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+80], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+96], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+72], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+88], rdi + mov QWORD PTR [rcx+64], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+-56] + mov r12, QWORD PTR [rcx+-40] + mov r13, QWORD PTR [rcx+-24] + mov r14, QWORD PTR [rcx+-48] + mov r15, QWORD PTR [rcx+-32] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + 
rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-40], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-24], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-48], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-32], rdi + mov QWORD PTR [rcx+-56], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+24] + xor r9, QWORD PTR [rcx+40] + xor r10, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+32] + xor rax, QWORD PTR [rcx+48] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+24], r11 + mov QWORD PTR [rcx+40], r12 + mov QWORD PTR [rcx+56], r13 + mov QWORD PTR [rcx+32], r14 + mov QWORD PTR [rcx+48], r15 + ; Round 6 + xor r11, rsi + xor r14, QWORD PTR [rcx+-88] + xor r12, QWORD PTR [rcx+-80] + xor r15, QWORD PTR [rcx+-72] + xor r13, QWORD PTR [rcx+-64] + xor r11, QWORD PTR [rcx+-56] + xor r14, QWORD PTR [rcx+-48] + xor r12, QWORD PTR [rcx+-40] + xor r15, QWORD PTR [rcx+-32] + xor r13, QWORD PTR [rcx+-24] + xor r11, QWORD PTR [rcx+-16] + xor r14, QWORD PTR [rcx+-8] + xor r12, QWORD PTR [rcx] + xor r15, QWORD PTR [rcx+8] + xor r13, QWORD PTR [rcx+16] + xor r11, QWORD PTR [rcx+64] + xor r14, QWORD PTR [rcx+72] + xor r12, QWORD PTR [rcx+80] + xor r15, QWORD PTR [rcx+88] + xor r13, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx] + mov r13, QWORD PTR [rcx+96] + mov r14, QWORD PTR [rcx+-48] + mov r15, QWORD PTR [rcx+48] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 
44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+96], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-48], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+48], r14 + ; XOR in constant + mov r15, 9223372039002292353 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+-88] + mov r12, QWORD PTR [rcx+8] + mov r13, QWORD PTR [rcx+64] + mov r14, QWORD PTR [rcx+-40] + mov r15, QWORD PTR [rcx+56] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+8], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+64], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-40], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+56], rdi + mov QWORD PTR [rcx+-88], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+-80] + mov r12, QWORD PTR [rcx+16] + mov r13, QWORD PTR [rcx+72] + mov r14, QWORD PTR [rcx+-32] + mov r15, QWORD PTR [rcx+24] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+16], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+72], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-32], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+24], rdi + mov QWORD PTR [rcx+-80], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+-72] + mov r12, QWORD PTR [rcx+-16] + mov r13, QWORD PTR [rcx+80] + mov r14, QWORD PTR [rcx+-24] + mov r15, QWORD PTR [rcx+32] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, 
r14 + xor rdi, r12 + mov QWORD PTR [rcx+-16], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+80], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-24], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+32], rdi + mov QWORD PTR [rcx+-72], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+-64] + xor r9, QWORD PTR [rcx+-8] + xor r10, QWORD PTR [rcx+88] + xor rdx, QWORD PTR [rcx+-56] + xor rax, QWORD PTR [rcx+40] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+-64], r11 + mov QWORD PTR [rcx+-8], r12 + mov QWORD PTR [rcx+88], r13 + mov QWORD PTR [rcx+-56], r14 + mov QWORD PTR [rcx+40], r15 + ; Round 7 + xor r11, rsi + xor r11, QWORD PTR [rcx+-88] + xor r11, QWORD PTR [rcx+-80] + xor r11, QWORD PTR [rcx+-72] + xor r14, QWORD PTR [rcx+-48] + xor r14, QWORD PTR [rcx+-40] + xor r14, QWORD PTR [rcx+-32] + xor r14, QWORD PTR [rcx+-24] + xor r12, QWORD PTR [rcx+-16] + xor r12, QWORD PTR [rcx] + xor r12, QWORD PTR [rcx+8] + xor r12, QWORD PTR [rcx+16] + xor r15, QWORD PTR [rcx+24] + xor r15, QWORD PTR [rcx+32] + xor r15, QWORD PTR [rcx+48] + xor r15, QWORD PTR [rcx+56] + xor r13, QWORD PTR [rcx+64] + xor r13, QWORD PTR [rcx+72] + xor r13, QWORD PTR [rcx+80] + xor r13, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+8] + mov r13, QWORD PTR [rcx+72] + mov r14, QWORD PTR [rcx+-24] + mov r15, QWORD PTR [rcx+40] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + 
xor rdi, r12 + mov QWORD PTR [rcx+8], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+72], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-24], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+40], r14 + ; XOR in constant + mov r15, 9223372036854808585 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+-48] + mov r12, QWORD PTR [rcx+56] + mov r13, QWORD PTR [rcx+-80] + mov r14, QWORD PTR [rcx+-16] + mov r15, QWORD PTR [rcx+88] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+56], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-80], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-16], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+88], rdi + mov QWORD PTR [rcx+-48], r14 + ; Row 2 + mov r11, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+64] + mov r13, QWORD PTR [rcx+-32] + mov r14, QWORD PTR [rcx+32] + mov r15, QWORD PTR [rcx+-64] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+64], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-32], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+32], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-64], rdi + mov QWORD PTR [rcx], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+48] + mov r12, QWORD PTR [rcx+-88] + mov r13, QWORD PTR [rcx+16] + mov r14, QWORD PTR [rcx+80] + mov r15, QWORD PTR [rcx+-56] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-88], rdi + andn rdi, r14, 
r15 + xor rdi, r13 + mov QWORD PTR [rcx+16], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+80], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-56], rdi + mov QWORD PTR [rcx+48], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+96] + xor r9, QWORD PTR [rcx+-40] + xor r10, QWORD PTR [rcx+24] + xor rdx, QWORD PTR [rcx+-72] + xor rax, QWORD PTR [rcx+-8] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+96], r11 + mov QWORD PTR [rcx+-40], r12 + mov QWORD PTR [rcx+24], r13 + mov QWORD PTR [rcx+-72], r14 + mov QWORD PTR [rcx+-8], r15 + ; Round 8 + xor r11, rsi + xor r12, QWORD PTR [rcx+-88] + xor r13, QWORD PTR [rcx+-80] + xor r15, QWORD PTR [rcx+-64] + xor r15, QWORD PTR [rcx+-56] + xor r11, QWORD PTR [rcx+-48] + xor r13, QWORD PTR [rcx+-32] + xor r14, QWORD PTR [rcx+-24] + xor r14, QWORD PTR [rcx+-16] + xor r11, QWORD PTR [rcx] + xor r12, QWORD PTR [rcx+8] + xor r13, QWORD PTR [rcx+16] + xor r14, QWORD PTR [rcx+32] + xor r15, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+48] + xor r12, QWORD PTR [rcx+56] + xor r12, QWORD PTR [rcx+64] + xor r13, QWORD PTR [rcx+72] + xor r14, QWORD PTR [rcx+80] + xor r15, QWORD PTR [rcx+88] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+56] + mov r13, QWORD PTR [rcx+-32] + mov r14, QWORD PTR [rcx+80] + mov r15, QWORD PTR [rcx+-8] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+56], rdi + andn rdi, r14, r15 + 
xor rdi, r13 + mov QWORD PTR [rcx+-32], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+80], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+-8], r14 + ; XOR in constant + xor rsi, 138 + ; Row 1 + mov r11, QWORD PTR [rcx+-24] + mov r12, QWORD PTR [rcx+88] + mov r13, QWORD PTR [rcx] + mov r14, QWORD PTR [rcx+-88] + mov r15, QWORD PTR [rcx+24] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+88], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-88], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+24], rdi + mov QWORD PTR [rcx+-24], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+8] + mov r12, QWORD PTR [rcx+-80] + mov r13, QWORD PTR [rcx+32] + mov r14, QWORD PTR [rcx+-56] + mov r15, QWORD PTR [rcx+96] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-80], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+32], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-56], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+96], rdi + mov QWORD PTR [rcx+8], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+40] + mov r12, QWORD PTR [rcx+-48] + mov r13, QWORD PTR [rcx+64] + mov r14, QWORD PTR [rcx+16] + mov r15, QWORD PTR [rcx+-72] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-48], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+64], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR 
[rcx+16], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-72], rdi + mov QWORD PTR [rcx+40], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+72] + xor r9, QWORD PTR [rcx+-16] + xor r10, QWORD PTR [rcx+-64] + xor rdx, QWORD PTR [rcx+48] + xor rax, QWORD PTR [rcx+-40] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+72], r11 + mov QWORD PTR [rcx+-16], r12 + mov QWORD PTR [rcx+-64], r13 + mov QWORD PTR [rcx+48], r14 + mov QWORD PTR [rcx+-40], r15 + ; Round 9 + xor r11, rsi + xor r14, QWORD PTR [rcx+-88] + xor r12, QWORD PTR [rcx+-80] + xor r15, QWORD PTR [rcx+-72] + xor r14, QWORD PTR [rcx+-56] + xor r12, QWORD PTR [rcx+-48] + xor r13, QWORD PTR [rcx+-32] + xor r11, QWORD PTR [rcx+-24] + xor r15, QWORD PTR [rcx+-8] + xor r13, QWORD PTR [rcx] + xor r11, QWORD PTR [rcx+8] + xor r14, QWORD PTR [rcx+16] + xor r15, QWORD PTR [rcx+24] + xor r13, QWORD PTR [rcx+32] + xor r11, QWORD PTR [rcx+40] + xor r12, QWORD PTR [rcx+56] + xor r13, QWORD PTR [rcx+64] + xor r14, QWORD PTR [rcx+80] + xor r12, QWORD PTR [rcx+88] + xor r15, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+88] + mov r13, QWORD PTR [rcx+32] + mov r14, QWORD PTR [rcx+16] + mov r15, QWORD PTR [rcx+-40] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+88], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+32], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR 
[rcx+16], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+-40], r14 + ; XOR in constant + xor rsi, 136 + ; Row 1 + mov r11, QWORD PTR [rcx+80] + mov r12, QWORD PTR [rcx+24] + mov r13, QWORD PTR [rcx+8] + mov r14, QWORD PTR [rcx+-48] + mov r15, QWORD PTR [rcx+-64] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+24], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+8], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-48], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-64], rdi + mov QWORD PTR [rcx+80], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+56] + mov r12, QWORD PTR [rcx] + mov r13, QWORD PTR [rcx+-56] + mov r14, QWORD PTR [rcx+-72] + mov r15, QWORD PTR [rcx+72] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-56], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-72], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+72], rdi + mov QWORD PTR [rcx+56], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+-8] + mov r12, QWORD PTR [rcx+-24] + mov r13, QWORD PTR [rcx+-80] + mov r14, QWORD PTR [rcx+64] + mov r15, QWORD PTR [rcx+48] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-24], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-80], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+64], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov 
QWORD PTR [rcx+48], rdi + mov QWORD PTR [rcx+-8], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+-32] + xor r9, QWORD PTR [rcx+-88] + xor r10, QWORD PTR [rcx+96] + xor rdx, QWORD PTR [rcx+40] + xor rax, QWORD PTR [rcx+-16] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+-32], r11 + mov QWORD PTR [rcx+-88], r12 + mov QWORD PTR [rcx+96], r13 + mov QWORD PTR [rcx+40], r14 + mov QWORD PTR [rcx+-16], r15 + ; Round 10 + xor r11, rsi + xor r13, QWORD PTR [rcx+-80] + xor r14, QWORD PTR [rcx+-72] + xor r15, QWORD PTR [rcx+-64] + xor r13, QWORD PTR [rcx+-56] + xor r14, QWORD PTR [rcx+-48] + xor r15, QWORD PTR [rcx+-40] + xor r12, QWORD PTR [rcx+-24] + xor r11, QWORD PTR [rcx+-8] + xor r12, QWORD PTR [rcx] + xor r13, QWORD PTR [rcx+8] + xor r14, QWORD PTR [rcx+16] + xor r12, QWORD PTR [rcx+24] + xor r13, QWORD PTR [rcx+32] + xor r15, QWORD PTR [rcx+48] + xor r11, QWORD PTR [rcx+56] + xor r14, QWORD PTR [rcx+64] + xor r15, QWORD PTR [rcx+72] + xor r11, QWORD PTR [rcx+80] + xor r12, QWORD PTR [rcx+88] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+24] + mov r13, QWORD PTR [rcx+-56] + mov r14, QWORD PTR [rcx+64] + mov r15, QWORD PTR [rcx+-16] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+24], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-56], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+64], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD 
PTR [rcx+-16], r14 + ; XOR in constant + mov r15, 2147516425 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+16] + mov r12, QWORD PTR [rcx+-64] + mov r13, QWORD PTR [rcx+56] + mov r14, QWORD PTR [rcx+-24] + mov r15, QWORD PTR [rcx+96] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-64], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+56], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-24], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+96], rdi + mov QWORD PTR [rcx+16], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+88] + mov r12, QWORD PTR [rcx+8] + mov r13, QWORD PTR [rcx+-72] + mov r14, QWORD PTR [rcx+48] + mov r15, QWORD PTR [rcx+-32] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+8], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-72], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+48], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-32], rdi + mov QWORD PTR [rcx+88], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+-40] + mov r12, QWORD PTR [rcx+80] + mov r13, QWORD PTR [rcx] + mov r14, QWORD PTR [rcx+-80] + mov r15, QWORD PTR [rcx+40] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+80], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-80], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+40], rdi + mov QWORD PTR [rcx+-40], r14 + ; Row 4 + xor r8, 
QWORD PTR [rcx+32] + xor r9, QWORD PTR [rcx+-48] + xor r10, QWORD PTR [rcx+72] + xor rdx, QWORD PTR [rcx+-8] + xor rax, QWORD PTR [rcx+-88] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+32], r11 + mov QWORD PTR [rcx+-48], r12 + mov QWORD PTR [rcx+72], r13 + mov QWORD PTR [rcx+-8], r14 + mov QWORD PTR [rcx+-88], r15 + ; Round 11 + xor r11, rsi + xor r14, QWORD PTR [rcx+-80] + xor r13, QWORD PTR [rcx+-72] + xor r12, QWORD PTR [rcx+-64] + xor r13, QWORD PTR [rcx+-56] + xor r11, QWORD PTR [rcx+-40] + xor r15, QWORD PTR [rcx+-32] + xor r14, QWORD PTR [rcx+-24] + xor r15, QWORD PTR [rcx+-16] + xor r13, QWORD PTR [rcx] + xor r12, QWORD PTR [rcx+8] + xor r11, QWORD PTR [rcx+16] + xor r12, QWORD PTR [rcx+24] + xor r15, QWORD PTR [rcx+40] + xor r14, QWORD PTR [rcx+48] + xor r13, QWORD PTR [rcx+56] + xor r14, QWORD PTR [rcx+64] + xor r12, QWORD PTR [rcx+80] + xor r11, QWORD PTR [rcx+88] + xor r15, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+-64] + mov r13, QWORD PTR [rcx+-72] + mov r14, QWORD PTR [rcx+-80] + mov r15, QWORD PTR [rcx+-88] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-64], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-72], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-80], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+-88], r14 + ; XOR in constant + mov r15, 2147483658 + xor 
rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+64] + mov r12, QWORD PTR [rcx+96] + mov r13, QWORD PTR [rcx+88] + mov r14, QWORD PTR [rcx+80] + mov r15, QWORD PTR [rcx+72] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+96], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+88], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+80], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+72], rdi + mov QWORD PTR [rcx+64], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+24] + mov r12, QWORD PTR [rcx+56] + mov r13, QWORD PTR [rcx+48] + mov r14, QWORD PTR [rcx+40] + mov r15, QWORD PTR [rcx+32] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+56], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+48], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+40], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+32], rdi + mov QWORD PTR [rcx+24], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+-16] + mov r12, QWORD PTR [rcx+16] + mov r13, QWORD PTR [rcx+8] + mov r14, QWORD PTR [rcx] + mov r15, QWORD PTR [rcx+-8] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+16], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+8], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-8], rdi + mov QWORD PTR [rcx+-16], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+-56] + xor r9, QWORD PTR [rcx+-24] + xor r10, QWORD PTR [rcx+-32] 
+ xor rdx, QWORD PTR [rcx+-40] + xor rax, QWORD PTR [rcx+-48] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+-56], r11 + mov QWORD PTR [rcx+-24], r12 + mov QWORD PTR [rcx+-32], r13 + mov QWORD PTR [rcx+-40], r14 + mov QWORD PTR [rcx+-48], r15 + ; Round 12 + xor r11, rsi + xor r15, QWORD PTR [rcx+-88] + xor r14, QWORD PTR [rcx+-80] + xor r13, QWORD PTR [rcx+-72] + xor r12, QWORD PTR [rcx+-64] + xor r11, QWORD PTR [rcx+-16] + xor r15, QWORD PTR [rcx+-8] + xor r14, QWORD PTR [rcx] + xor r13, QWORD PTR [rcx+8] + xor r12, QWORD PTR [rcx+16] + xor r11, QWORD PTR [rcx+24] + xor r15, QWORD PTR [rcx+32] + xor r14, QWORD PTR [rcx+40] + xor r13, QWORD PTR [rcx+48] + xor r12, QWORD PTR [rcx+56] + xor r11, QWORD PTR [rcx+64] + xor r15, QWORD PTR [rcx+72] + xor r14, QWORD PTR [rcx+80] + xor r13, QWORD PTR [rcx+88] + xor r12, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+96] + mov r13, QWORD PTR [rcx+48] + mov r14, QWORD PTR [rcx] + mov r15, QWORD PTR [rcx+-48] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+96], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+48], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+-48], r14 + ; XOR in constant + mov r15, 2147516555 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+-80] + mov r12, QWORD PTR [rcx+72] + mov r13, 
QWORD PTR [rcx+24] + mov r14, QWORD PTR [rcx+16] + mov r15, QWORD PTR [rcx+-32] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+72], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+24], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+16], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-32], rdi + mov QWORD PTR [rcx+-80], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+-64] + mov r12, QWORD PTR [rcx+88] + mov r13, QWORD PTR [rcx+40] + mov r14, QWORD PTR [rcx+-8] + mov r15, QWORD PTR [rcx+-56] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+88], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+40], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-8], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-56], rdi + mov QWORD PTR [rcx+-64], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+-88] + mov r12, QWORD PTR [rcx+64] + mov r13, QWORD PTR [rcx+56] + mov r14, QWORD PTR [rcx+8] + mov r15, QWORD PTR [rcx+-40] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+64], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+56], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+8], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-40], rdi + mov QWORD PTR [rcx+-88], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+-72] + xor r9, QWORD PTR [rcx+80] + xor r10, QWORD PTR [rcx+32] + xor rdx, QWORD PTR [rcx+-16] + xor rax, QWORD PTR [rcx+-24] + rorx r11, r8, 
2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+-72], r11 + mov QWORD PTR [rcx+80], r12 + mov QWORD PTR [rcx+32], r13 + mov QWORD PTR [rcx+-16], r14 + mov QWORD PTR [rcx+-24], r15 + ; Round 13 + xor r11, rsi + xor r11, QWORD PTR [rcx+-88] + xor r11, QWORD PTR [rcx+-80] + xor r11, QWORD PTR [rcx+-64] + xor r15, QWORD PTR [rcx+-56] + xor r15, QWORD PTR [rcx+-48] + xor r15, QWORD PTR [rcx+-40] + xor r15, QWORD PTR [rcx+-32] + xor r14, QWORD PTR [rcx+-8] + xor r14, QWORD PTR [rcx] + xor r14, QWORD PTR [rcx+8] + xor r14, QWORD PTR [rcx+16] + xor r13, QWORD PTR [rcx+24] + xor r13, QWORD PTR [rcx+40] + xor r13, QWORD PTR [rcx+48] + xor r13, QWORD PTR [rcx+56] + xor r12, QWORD PTR [rcx+64] + xor r12, QWORD PTR [rcx+72] + xor r12, QWORD PTR [rcx+88] + xor r12, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+72] + mov r13, QWORD PTR [rcx+40] + mov r14, QWORD PTR [rcx+8] + mov r15, QWORD PTR [rcx+-24] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+72], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+40], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+8], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+-24], r14 + ; XOR in constant + mov r15, 9223372036854775947 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+-32] + mov r13, QWORD PTR [rcx+-64] + mov r14, QWORD PTR [rcx+64] + mov r15, QWORD PTR 
[rcx+32] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-32], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-64], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+64], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+32], rdi + mov QWORD PTR [rcx], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+96] + mov r12, QWORD PTR [rcx+24] + mov r13, QWORD PTR [rcx+-8] + mov r14, QWORD PTR [rcx+-40] + mov r15, QWORD PTR [rcx+-72] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+24], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-8], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-40], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-72], rdi + mov QWORD PTR [rcx+96], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+-48] + mov r12, QWORD PTR [rcx+-80] + mov r13, QWORD PTR [rcx+88] + mov r14, QWORD PTR [rcx+56] + mov r15, QWORD PTR [rcx+-16] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-80], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+88], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+56], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-16], rdi + mov QWORD PTR [rcx+-48], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+48] + xor r9, QWORD PTR [rcx+16] + xor r10, QWORD PTR [rcx+-56] + xor rdx, QWORD PTR [rcx+-88] + xor rax, QWORD PTR [rcx+80] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx 
r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+48], r11 + mov QWORD PTR [rcx+16], r12 + mov QWORD PTR [rcx+-56], r13 + mov QWORD PTR [rcx+-88], r14 + mov QWORD PTR [rcx+80], r15 + ; Round 14 + xor r11, rsi + xor r12, QWORD PTR [rcx+-80] + xor r15, QWORD PTR [rcx+-72] + xor r13, QWORD PTR [rcx+-64] + xor r11, QWORD PTR [rcx+-48] + xor r14, QWORD PTR [rcx+-40] + xor r12, QWORD PTR [rcx+-32] + xor r15, QWORD PTR [rcx+-24] + xor r15, QWORD PTR [rcx+-16] + xor r13, QWORD PTR [rcx+-8] + xor r11, QWORD PTR [rcx] + xor r14, QWORD PTR [rcx+8] + xor r12, QWORD PTR [rcx+24] + xor r15, QWORD PTR [rcx+32] + xor r13, QWORD PTR [rcx+40] + xor r14, QWORD PTR [rcx+56] + xor r14, QWORD PTR [rcx+64] + xor r12, QWORD PTR [rcx+72] + xor r13, QWORD PTR [rcx+88] + xor r11, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+-32] + mov r13, QWORD PTR [rcx+-8] + mov r14, QWORD PTR [rcx+56] + mov r15, QWORD PTR [rcx+80] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-32], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-8], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+56], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+80], r14 + ; XOR in constant + mov r15, 9223372036854808713 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+8] + mov r12, QWORD PTR [rcx+32] + mov r13, QWORD PTR [rcx+96] + mov r14, QWORD PTR [rcx+-80] + mov r15, QWORD PTR [rcx+-56] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor 
r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+32], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+96], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-80], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-56], rdi + mov QWORD PTR [rcx+8], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+72] + mov r12, QWORD PTR [rcx+-64] + mov r13, QWORD PTR [rcx+-40] + mov r14, QWORD PTR [rcx+-16] + mov r15, QWORD PTR [rcx+48] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-64], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-40], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-16], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+48], rdi + mov QWORD PTR [rcx+72], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+-24] + mov r12, QWORD PTR [rcx] + mov r13, QWORD PTR [rcx+24] + mov r14, QWORD PTR [rcx+88] + mov r15, QWORD PTR [rcx+-88] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+24], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+88], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-88], rdi + mov QWORD PTR [rcx+-24], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+64] + xor r10, QWORD PTR [rcx+-72] + xor rdx, QWORD PTR [rcx+-48] + xor rax, QWORD PTR [rcx+16] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, 
r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+40], r11 + mov QWORD PTR [rcx+64], r12 + mov QWORD PTR [rcx+-72], r13 + mov QWORD PTR [rcx+-48], r14 + mov QWORD PTR [rcx+16], r15 + ; Round 15 + xor r11, rsi + xor r15, QWORD PTR [rcx+-88] + xor r14, QWORD PTR [rcx+-80] + xor r12, QWORD PTR [rcx+-64] + xor r15, QWORD PTR [rcx+-56] + xor r13, QWORD PTR [rcx+-40] + xor r12, QWORD PTR [rcx+-32] + xor r11, QWORD PTR [rcx+-24] + xor r14, QWORD PTR [rcx+-16] + xor r13, QWORD PTR [rcx+-8] + xor r12, QWORD PTR [rcx] + xor r11, QWORD PTR [rcx+8] + xor r13, QWORD PTR [rcx+24] + xor r12, QWORD PTR [rcx+32] + xor r15, QWORD PTR [rcx+48] + xor r14, QWORD PTR [rcx+56] + xor r11, QWORD PTR [rcx+72] + xor r15, QWORD PTR [rcx+80] + xor r14, QWORD PTR [rcx+88] + xor r13, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+32] + mov r13, QWORD PTR [rcx+-40] + mov r14, QWORD PTR [rcx+88] + mov r15, QWORD PTR [rcx+16] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+32], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-40], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+88], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+16], r14 + ; XOR in constant + mov r15, 9223372036854808579 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+56] + mov r12, QWORD PTR [rcx+-56] + mov r13, QWORD PTR [rcx+72] + mov r14, QWORD PTR [rcx] + mov r15, QWORD PTR [rcx+-72] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol 
r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-56], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+72], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-72], rdi + mov QWORD PTR [rcx+56], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+-32] + mov r12, QWORD PTR [rcx+96] + mov r13, QWORD PTR [rcx+-16] + mov r14, QWORD PTR [rcx+-88] + mov r15, QWORD PTR [rcx+40] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+96], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-16], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-88], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+40], rdi + mov QWORD PTR [rcx+-32], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+80] + mov r12, QWORD PTR [rcx+8] + mov r13, QWORD PTR [rcx+-64] + mov r14, QWORD PTR [rcx+24] + mov r15, QWORD PTR [rcx+-48] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+8], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-64], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+24], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-48], rdi + mov QWORD PTR [rcx+80], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+-8] + xor r9, QWORD PTR [rcx+-80] + xor r10, QWORD PTR [rcx+48] + xor rdx, QWORD PTR [rcx+-24] + xor rax, QWORD PTR [rcx+64] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + 
xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+-8], r11 + mov QWORD PTR [rcx+-80], r12 + mov QWORD PTR [rcx+48], r13 + mov QWORD PTR [rcx+-24], r14 + mov QWORD PTR [rcx+64], r15 + ; Round 16 + xor r11, rsi + xor r14, QWORD PTR [rcx+-88] + xor r15, QWORD PTR [rcx+-72] + xor r13, QWORD PTR [rcx+-64] + xor r12, QWORD PTR [rcx+-56] + xor r15, QWORD PTR [rcx+-48] + xor r13, QWORD PTR [rcx+-40] + xor r11, QWORD PTR [rcx+-32] + xor r13, QWORD PTR [rcx+-16] + xor r14, QWORD PTR [rcx] + xor r12, QWORD PTR [rcx+8] + xor r15, QWORD PTR [rcx+16] + xor r14, QWORD PTR [rcx+24] + xor r12, QWORD PTR [rcx+32] + xor r15, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+56] + xor r13, QWORD PTR [rcx+72] + xor r11, QWORD PTR [rcx+80] + xor r14, QWORD PTR [rcx+88] + xor r12, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+-56] + mov r13, QWORD PTR [rcx+-16] + mov r14, QWORD PTR [rcx+24] + mov r15, QWORD PTR [rcx+64] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-56], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-16], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+24], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+64], r14 + ; XOR in constant + mov r15, 9223372036854808578 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+88] + mov r12, QWORD PTR [rcx+-72] + mov r13, QWORD PTR [rcx+-32] + mov r14, QWORD PTR [rcx+8] + mov r15, QWORD PTR [rcx+48] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov 
QWORD PTR [rcx+-72], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-32], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+8], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+48], rdi + mov QWORD PTR [rcx+88], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+32] + mov r12, QWORD PTR [rcx+72] + mov r13, QWORD PTR [rcx+-88] + mov r14, QWORD PTR [rcx+-48] + mov r15, QWORD PTR [rcx+-8] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+72], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-88], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-48], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-8], rdi + mov QWORD PTR [rcx+32], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+16] + mov r12, QWORD PTR [rcx+56] + mov r13, QWORD PTR [rcx+96] + mov r14, QWORD PTR [rcx+-64] + mov r15, QWORD PTR [rcx+-24] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+56], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+96], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-64], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-24], rdi + mov QWORD PTR [rcx+16], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+-40] + xor r9, QWORD PTR [rcx] + xor r10, QWORD PTR [rcx+40] + xor rdx, QWORD PTR [rcx+80] + xor rax, QWORD PTR [rcx+-80] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov 
QWORD PTR [rcx+-40], r11 + mov QWORD PTR [rcx], r12 + mov QWORD PTR [rcx+40], r13 + mov QWORD PTR [rcx+80], r14 + mov QWORD PTR [rcx+-80], r15 + ; Round 17 + xor r11, rsi + xor r13, QWORD PTR [rcx+-88] + xor r12, QWORD PTR [rcx+-72] + xor r14, QWORD PTR [rcx+-64] + xor r12, QWORD PTR [rcx+-56] + xor r14, QWORD PTR [rcx+-48] + xor r13, QWORD PTR [rcx+-32] + xor r15, QWORD PTR [rcx+-24] + xor r13, QWORD PTR [rcx+-16] + xor r15, QWORD PTR [rcx+-8] + xor r14, QWORD PTR [rcx+8] + xor r11, QWORD PTR [rcx+16] + xor r14, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+32] + xor r15, QWORD PTR [rcx+48] + xor r12, QWORD PTR [rcx+56] + xor r15, QWORD PTR [rcx+64] + xor r12, QWORD PTR [rcx+72] + xor r11, QWORD PTR [rcx+88] + xor r13, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+-72] + mov r13, QWORD PTR [rcx+-88] + mov r14, QWORD PTR [rcx+-64] + mov r15, QWORD PTR [rcx+-80] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-72], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-88], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-64], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+-80], r14 + ; XOR in constant + mov r15, 9223372036854775936 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+24] + mov r12, QWORD PTR [rcx+48] + mov r13, QWORD PTR [rcx+32] + mov r14, QWORD PTR [rcx+56] + mov r15, QWORD PTR [rcx+40] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+48], rdi + andn rdi, r14, r15 + xor rdi, r13 
+ mov QWORD PTR [rcx+32], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+56], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+40], rdi + mov QWORD PTR [rcx+24], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+-56] + mov r12, QWORD PTR [rcx+-32] + mov r13, QWORD PTR [rcx+-48] + mov r14, QWORD PTR [rcx+-24] + mov r15, QWORD PTR [rcx+-40] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-32], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-48], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-24], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-40], rdi + mov QWORD PTR [rcx+-56], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+64] + mov r12, QWORD PTR [rcx+88] + mov r13, QWORD PTR [rcx+72] + mov r14, QWORD PTR [rcx+96] + mov r15, QWORD PTR [rcx+80] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+88], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+72], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+96], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+80], rdi + mov QWORD PTR [rcx+64], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+-16] + xor r9, QWORD PTR [rcx+8] + xor r10, QWORD PTR [rcx+-8] + xor rdx, QWORD PTR [rcx+16] + xor rax, QWORD PTR [rcx] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+-16], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD 
PTR [rcx+-8], r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx], r15 + ; Round 18 + xor r11, rsi + xor r13, QWORD PTR [rcx+-88] + xor r15, QWORD PTR [rcx+-80] + xor r12, QWORD PTR [rcx+-72] + xor r14, QWORD PTR [rcx+-64] + xor r11, QWORD PTR [rcx+-56] + xor r13, QWORD PTR [rcx+-48] + xor r15, QWORD PTR [rcx+-40] + xor r12, QWORD PTR [rcx+-32] + xor r14, QWORD PTR [rcx+-24] + xor r11, QWORD PTR [rcx+24] + xor r13, QWORD PTR [rcx+32] + xor r15, QWORD PTR [rcx+40] + xor r12, QWORD PTR [rcx+48] + xor r14, QWORD PTR [rcx+56] + xor r11, QWORD PTR [rcx+64] + xor r13, QWORD PTR [rcx+72] + xor r15, QWORD PTR [rcx+80] + xor r12, QWORD PTR [rcx+88] + xor r14, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+48] + mov r13, QWORD PTR [rcx+-48] + mov r14, QWORD PTR [rcx+96] + mov r15, QWORD PTR [rcx] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+48], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-48], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+96], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx], r14 + ; XOR in constant + xor rsi, 32778 + ; Row 1 + mov r11, QWORD PTR [rcx+-64] + mov r12, QWORD PTR [rcx+40] + mov r13, QWORD PTR [rcx+-56] + mov r14, QWORD PTR [rcx+88] + mov r15, QWORD PTR [rcx+-8] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+40], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-56], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+88], rdi + andn 
rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-8], rdi + mov QWORD PTR [rcx+-64], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+-72] + mov r12, QWORD PTR [rcx+32] + mov r13, QWORD PTR [rcx+-24] + mov r14, QWORD PTR [rcx+80] + mov r15, QWORD PTR [rcx+-16] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+32], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-24], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+80], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-16], rdi + mov QWORD PTR [rcx+-72], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+-80] + mov r12, QWORD PTR [rcx+24] + mov r13, QWORD PTR [rcx+-32] + mov r14, QWORD PTR [rcx+72] + mov r15, QWORD PTR [rcx+16] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+24], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-32], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+72], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+16], rdi + mov QWORD PTR [rcx+-80], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+-88] + xor r9, QWORD PTR [rcx+56] + xor r10, QWORD PTR [rcx+-40] + xor rdx, QWORD PTR [rcx+64] + xor rax, QWORD PTR [rcx+8] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+-88], r11 + mov QWORD PTR [rcx+56], r12 + mov QWORD PTR [rcx+-40], r13 + mov QWORD PTR [rcx+64], r14 + mov QWORD PTR [rcx+8], r15 + ; Round 19 + xor 
r11, rsi + xor r11, QWORD PTR [rcx+-80] + xor r11, QWORD PTR [rcx+-72] + xor r11, QWORD PTR [rcx+-64] + xor r13, QWORD PTR [rcx+-56] + xor r13, QWORD PTR [rcx+-48] + xor r13, QWORD PTR [rcx+-32] + xor r13, QWORD PTR [rcx+-24] + xor r15, QWORD PTR [rcx+-16] + xor r15, QWORD PTR [rcx+-8] + xor r15, QWORD PTR [rcx] + xor r15, QWORD PTR [rcx+16] + xor r12, QWORD PTR [rcx+24] + xor r12, QWORD PTR [rcx+32] + xor r12, QWORD PTR [rcx+40] + xor r12, QWORD PTR [rcx+48] + xor r14, QWORD PTR [rcx+72] + xor r14, QWORD PTR [rcx+80] + xor r14, QWORD PTR [rcx+88] + xor r14, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+40] + mov r13, QWORD PTR [rcx+-24] + mov r14, QWORD PTR [rcx+72] + mov r15, QWORD PTR [rcx+8] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+40], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-24], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+72], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+8], r14 + ; XOR in constant + mov r15, 9223372039002259466 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+96] + mov r12, QWORD PTR [rcx+-8] + mov r13, QWORD PTR [rcx+-72] + mov r14, QWORD PTR [rcx+24] + mov r15, QWORD PTR [rcx+-40] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-8], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-72], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+24], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 
+ mov QWORD PTR [rcx+-40], rdi + mov QWORD PTR [rcx+96], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+48] + mov r12, QWORD PTR [rcx+-56] + mov r13, QWORD PTR [rcx+80] + mov r14, QWORD PTR [rcx+16] + mov r15, QWORD PTR [rcx+-88] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-56], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+80], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+16], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-88], rdi + mov QWORD PTR [rcx+48], r14 + ; Row 3 + mov r11, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+-64] + mov r13, QWORD PTR [rcx+32] + mov r14, QWORD PTR [rcx+-32] + mov r15, QWORD PTR [rcx+64] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-64], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+32], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-32], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+64], rdi + mov QWORD PTR [rcx], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+-48] + xor r9, QWORD PTR [rcx+88] + xor r10, QWORD PTR [rcx+-16] + xor rdx, QWORD PTR [rcx+-80] + xor rax, QWORD PTR [rcx+56] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+-48], r11 + mov QWORD PTR [rcx+88], r12 + mov QWORD PTR [rcx+-16], r13 + mov QWORD PTR [rcx+-80], r14 + mov QWORD PTR [rcx+56], r15 + ; Round 20 + xor r11, rsi + xor r15, QWORD PTR [rcx+-88] + xor r13, QWORD PTR 
[rcx+-72] + xor r12, QWORD PTR [rcx+-64] + xor r12, QWORD PTR [rcx+-56] + xor r15, QWORD PTR [rcx+-40] + xor r14, QWORD PTR [rcx+-32] + xor r13, QWORD PTR [rcx+-24] + xor r12, QWORD PTR [rcx+-8] + xor r11, QWORD PTR [rcx] + xor r15, QWORD PTR [rcx+8] + xor r14, QWORD PTR [rcx+16] + xor r14, QWORD PTR [rcx+24] + xor r13, QWORD PTR [rcx+32] + xor r12, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+48] + xor r15, QWORD PTR [rcx+64] + xor r14, QWORD PTR [rcx+72] + xor r13, QWORD PTR [rcx+80] + xor r11, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+-8] + mov r13, QWORD PTR [rcx+80] + mov r14, QWORD PTR [rcx+-32] + mov r15, QWORD PTR [rcx+56] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-8], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+80], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-32], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+56], r14 + ; XOR in constant + mov r15, 9223372039002292353 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+72] + mov r12, QWORD PTR [rcx+-40] + mov r13, QWORD PTR [rcx+48] + mov r14, QWORD PTR [rcx+-64] + mov r15, QWORD PTR [rcx+-16] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-40], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+48], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-64], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-16], rdi + mov QWORD PTR [rcx+72], 
r14 + ; Row 2 + mov r11, QWORD PTR [rcx+40] + mov r12, QWORD PTR [rcx+-72] + mov r13, QWORD PTR [rcx+16] + mov r14, QWORD PTR [rcx+64] + mov r15, QWORD PTR [rcx+-48] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-72], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+16], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+64], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-48], rdi + mov QWORD PTR [rcx+40], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+8] + mov r12, QWORD PTR [rcx+96] + mov r13, QWORD PTR [rcx+-56] + mov r14, QWORD PTR [rcx+32] + mov r15, QWORD PTR [rcx+-80] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+96], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-56], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+32], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-80], rdi + mov QWORD PTR [rcx+8], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+-24] + xor r9, QWORD PTR [rcx+24] + xor r10, QWORD PTR [rcx+-88] + xor rdx, QWORD PTR [rcx] + xor rax, QWORD PTR [rcx+88] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+-24], r11 + mov QWORD PTR [rcx+24], r12 + mov QWORD PTR [rcx+-88], r13 + mov QWORD PTR [rcx], r14 + mov QWORD PTR [rcx+88], r15 + ; Round 21 + xor r11, rsi + xor r15, QWORD PTR [rcx+-80] + xor r12, QWORD PTR [rcx+-72] + xor r14, QWORD PTR [rcx+-64] + xor r13, QWORD PTR 
[rcx+-56] + xor r15, QWORD PTR [rcx+-48] + xor r12, QWORD PTR [rcx+-40] + xor r14, QWORD PTR [rcx+-32] + xor r15, QWORD PTR [rcx+-16] + xor r12, QWORD PTR [rcx+-8] + xor r11, QWORD PTR [rcx+8] + xor r13, QWORD PTR [rcx+16] + xor r14, QWORD PTR [rcx+32] + xor r11, QWORD PTR [rcx+40] + xor r13, QWORD PTR [rcx+48] + xor r15, QWORD PTR [rcx+56] + xor r14, QWORD PTR [rcx+64] + xor r11, QWORD PTR [rcx+72] + xor r13, QWORD PTR [rcx+80] + xor r12, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+-40] + mov r13, QWORD PTR [rcx+16] + mov r14, QWORD PTR [rcx+32] + mov r15, QWORD PTR [rcx+88] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-40], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+16], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+32], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+88], r14 + ; XOR in constant + mov r15, 9223372036854808704 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+-32] + mov r12, QWORD PTR [rcx+-16] + mov r13, QWORD PTR [rcx+40] + mov r14, QWORD PTR [rcx+96] + mov r15, QWORD PTR [rcx+-88] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-16], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+40], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+96], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-88], rdi + mov QWORD PTR [rcx+-32], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+-8] + mov r12, QWORD 
PTR [rcx+48] + mov r13, QWORD PTR [rcx+64] + mov r14, QWORD PTR [rcx+-80] + mov r15, QWORD PTR [rcx+-24] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+48], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+64], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-80], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-24], rdi + mov QWORD PTR [rcx+-8], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+56] + mov r12, QWORD PTR [rcx+72] + mov r13, QWORD PTR [rcx+-72] + mov r14, QWORD PTR [rcx+-56] + mov r15, QWORD PTR [rcx] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+72], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-72], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-56], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+56], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+80] + xor r9, QWORD PTR [rcx+-64] + xor r10, QWORD PTR [rcx+-48] + xor rdx, QWORD PTR [rcx+8] + xor rax, QWORD PTR [rcx+24] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+80], r11 + mov QWORD PTR [rcx+-64], r12 + mov QWORD PTR [rcx+-48], r13 + mov QWORD PTR [rcx+8], r14 + mov QWORD PTR [rcx+24], r15 + ; Round 22 + xor r11, rsi + xor r15, QWORD PTR [rcx+-88] + xor r14, QWORD PTR [rcx+-80] + xor r13, QWORD PTR [rcx+-72] + xor r14, QWORD PTR [rcx+-56] + xor r12, QWORD PTR [rcx+-40] + xor r11, QWORD PTR 
[rcx+-32] + xor r15, QWORD PTR [rcx+-24] + xor r12, QWORD PTR [rcx+-16] + xor r11, QWORD PTR [rcx+-8] + xor r15, QWORD PTR [rcx] + xor r13, QWORD PTR [rcx+16] + xor r14, QWORD PTR [rcx+32] + xor r13, QWORD PTR [rcx+40] + xor r12, QWORD PTR [rcx+48] + xor r11, QWORD PTR [rcx+56] + xor r13, QWORD PTR [rcx+64] + xor r12, QWORD PTR [rcx+72] + xor r15, QWORD PTR [rcx+88] + xor r14, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+-16] + mov r13, QWORD PTR [rcx+64] + mov r14, QWORD PTR [rcx+-56] + mov r15, QWORD PTR [rcx+24] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-16], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+64], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-56], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+24], r14 + ; XOR in constant + mov r15, 2147483649 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+32] + mov r12, QWORD PTR [rcx+-88] + mov r13, QWORD PTR [rcx+-8] + mov r14, QWORD PTR [rcx+72] + mov r15, QWORD PTR [rcx+-48] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-88], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-8], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+72], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-48], rdi + mov QWORD PTR [rcx+32], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+-40] + mov r12, QWORD PTR [rcx+40] + mov r13, QWORD PTR [rcx+-80] + mov r14, QWORD PTR [rcx] 
+ mov r15, QWORD PTR [rcx+80] + xor r11, rax + xor r12, r8 + xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+40], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-80], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+80], rdi + mov QWORD PTR [rcx+-40], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+88] + mov r12, QWORD PTR [rcx+-32] + mov r13, QWORD PTR [rcx+48] + mov r14, QWORD PTR [rcx+-72] + mov r15, QWORD PTR [rcx+8] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-32], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+48], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-72], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+8], rdi + mov QWORD PTR [rcx+88], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+16] + xor r9, QWORD PTR [rcx+96] + xor r10, QWORD PTR [rcx+-24] + xor rdx, QWORD PTR [rcx+56] + xor rax, QWORD PTR [rcx+-64] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+16], r11 + mov QWORD PTR [rcx+96], r12 + mov QWORD PTR [rcx+-24], r13 + mov QWORD PTR [rcx+56], r14 + mov QWORD PTR [rcx+-64], r15 + ; Round 23 + xor r11, rsi + xor r12, QWORD PTR [rcx+-88] + xor r13, QWORD PTR [rcx+-80] + xor r14, QWORD PTR [rcx+-72] + xor r14, QWORD PTR [rcx+-56] + xor r15, QWORD PTR [rcx+-48] + xor r11, QWORD PTR [rcx+-40] + xor r12, QWORD PTR [rcx+-32] + xor r12, QWORD PTR [rcx+-16] + xor 
r13, QWORD PTR [rcx+-8] + xor r14, QWORD PTR [rcx] + xor r15, QWORD PTR [rcx+8] + xor r15, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+32] + xor r12, QWORD PTR [rcx+40] + xor r13, QWORD PTR [rcx+48] + xor r13, QWORD PTR [rcx+64] + xor r14, QWORD PTR [rcx+72] + xor r15, QWORD PTR [rcx+80] + xor r11, QWORD PTR [rcx+88] + ; Calc t[0..4] + rorx rdx, r12, 63 + rorx rax, r13, 63 + rorx r8, r14, 63 + rorx r9, r15, 63 + rorx r10, r11, 63 + xor rdx, r15 + xor rax, r11 + xor r8, r12 + xor r9, r13 + xor r10, r14 + ; Row Mix + ; Row 0 + mov r11, rsi + mov r12, QWORD PTR [rcx+-88] + mov r13, QWORD PTR [rcx+-80] + mov r14, QWORD PTR [rcx+-72] + mov r15, QWORD PTR [rcx+-64] + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + rol r12, 44 + rol r13, 43 + rol r14, 21 + rol r15, 14 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-88], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-80], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-72], rdi + andn r14, r11, r12 + andn rsi, r12, r13 + xor r14, r15 + xor rsi, r11 + mov QWORD PTR [rcx+-64], r14 + ; XOR in constant + mov r15, 9223372039002292232 + xor rsi, r15 + ; Row 1 + mov r11, QWORD PTR [rcx+-56] + mov r12, QWORD PTR [rcx+-48] + mov r13, QWORD PTR [rcx+-40] + mov r14, QWORD PTR [rcx+-32] + mov r15, QWORD PTR [rcx+-24] + xor r11, r9 + xor r12, r10 + xor r13, rdx + xor r14, rax + xor r15, r8 + rol r11, 28 + rol r12, 20 + rol r13, 3 + rol r14, 45 + rol r15, 61 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-48], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+-40], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+-32], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+-24], rdi + mov QWORD PTR [rcx+-56], r14 + ; Row 2 + mov r11, QWORD PTR [rcx+-16] + mov r12, QWORD PTR [rcx+-8] + mov r13, QWORD PTR [rcx] + mov r14, QWORD PTR [rcx+8] + mov r15, QWORD PTR [rcx+16] + xor r11, rax + xor r12, r8 + 
xor r13, r9 + xor r14, r10 + xor r15, rdx + rol r11, 1 + rol r12, 6 + rol r13, 25 + rol r14, 8 + rol r15, 18 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+-8], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+8], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+16], rdi + mov QWORD PTR [rcx+-16], r14 + ; Row 3 + mov r11, QWORD PTR [rcx+24] + mov r12, QWORD PTR [rcx+32] + mov r13, QWORD PTR [rcx+40] + mov r14, QWORD PTR [rcx+48] + mov r15, QWORD PTR [rcx+56] + xor r11, r10 + xor r12, rdx + xor r13, rax + xor r14, r8 + xor r15, r9 + rol r11, 27 + rol r12, 36 + rol r13, 10 + rol r14, 15 + rol r15, 56 + andn rdi, r13, r14 + xor rdi, r12 + mov QWORD PTR [rcx+32], rdi + andn rdi, r14, r15 + xor rdi, r13 + mov QWORD PTR [rcx+40], rdi + andn rdi, r15, r11 + xor rdi, r14 + mov QWORD PTR [rcx+48], rdi + andn rdi, r11, r12 + andn r14, r12, r13 + xor rdi, r15 + xor r14, r11 + mov QWORD PTR [rcx+56], rdi + mov QWORD PTR [rcx+24], r14 + ; Row 4 + xor r8, QWORD PTR [rcx+64] + xor r9, QWORD PTR [rcx+72] + xor r10, QWORD PTR [rcx+80] + xor rdx, QWORD PTR [rcx+88] + xor rax, QWORD PTR [rcx+96] + rorx r11, r8, 2 + rorx r12, r9, 9 + rorx r13, r10, 25 + rorx r14, rdx, 23 + rorx r15, rax, 62 + andn rdx, r12, r13 + andn rax, r13, r14 + andn r8, r14, r15 + andn r9, r15, r11 + andn r10, r11, r12 + xor r11, rdx + xor r12, rax + xor r13, r8 + xor r14, r9 + xor r15, r10 + mov QWORD PTR [rcx+64], r11 + mov QWORD PTR [rcx+72], r12 + mov QWORD PTR [rcx+80], r13 + mov QWORD PTR [rcx+88], r14 + mov QWORD PTR [rcx+96], r15 + mov QWORD PTR [rcx+-96], rsi + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sha3_block_bmi2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +sha3_block_n_bmi2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + push rbp + push r9 + mov rbp, r9 + mov r9, QWORD PTR [rcx] + add rcx, 96 
+L_sha3_block_n_bmi2_start: + cmp rbp, 136 + je L_sha3_block_n_bmi2_load_256 + cmp rbp, 168 + je L_sha3_block_n_bmi2_load_128 + cmp rbp, 144 + je L_sha3_block_n_bmi2_load_224 + cmp rbp, 104 + je L_sha3_block_n_bmi2_load_384 + mov r14, QWORD PTR [rdx] + mov r15, QWORD PTR [rdx+8] + mov rdi, QWORD PTR [rdx+16] + mov rsi, QWORD PTR [rdx+24] + mov rbx, QWORD PTR [rdx+32] + mov rax, QWORD PTR [rdx+40] + mov r10, QWORD PTR [rdx+48] + mov r11, QWORD PTR [rdx+56] + mov r12, QWORD PTR [rdx+64] + xor r14, r9 + xor r15, QWORD PTR [rcx+-88] + xor rdi, QWORD PTR [rcx+-80] + xor rsi, QWORD PTR [rcx+-72] + xor rbx, QWORD PTR [rcx+-64] + xor rax, QWORD PTR [rcx+-56] + xor r10, QWORD PTR [rcx+-48] + xor r11, QWORD PTR [rcx+-40] + xor r12, QWORD PTR [rcx+-32] + mov r9, r14 + mov QWORD PTR [rcx+-88], r15 + mov QWORD PTR [rcx+-80], rdi + mov QWORD PTR [rcx+-72], rsi + mov QWORD PTR [rcx+-64], rbx + mov QWORD PTR [rcx+-56], rax + mov QWORD PTR [rcx+-48], r10 + mov QWORD PTR [rcx+-40], r11 + mov QWORD PTR [rcx+-32], r12 + jmp L_sha3_block_n_bmi2_rounds +L_sha3_block_n_bmi2_load_128: + mov r14, QWORD PTR [rdx] + mov r15, QWORD PTR [rdx+8] + mov rdi, QWORD PTR [rdx+16] + mov rsi, QWORD PTR [rdx+24] + mov rbx, QWORD PTR [rdx+32] + xor r14, r9 + xor r15, QWORD PTR [rcx+-88] + xor rdi, QWORD PTR [rcx+-80] + xor rsi, QWORD PTR [rcx+-72] + xor rbx, QWORD PTR [rcx+-64] + mov r9, r14 + mov QWORD PTR [rcx+-88], r15 + mov QWORD PTR [rcx+-80], rdi + mov QWORD PTR [rcx+-72], rsi + mov QWORD PTR [rcx+-64], rbx + mov rax, QWORD PTR [rdx+40] + mov r10, QWORD PTR [rdx+48] + mov r11, QWORD PTR [rdx+56] + mov r12, QWORD PTR [rdx+64] + mov r13, QWORD PTR [rdx+72] + mov rbp, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rcx+-56] + xor r10, QWORD PTR [rcx+-48] + xor r11, QWORD PTR [rcx+-40] + xor r12, QWORD PTR [rcx+-32] + xor r13, QWORD PTR [rcx+-24] + xor rbp, QWORD PTR [rcx+-16] + mov QWORD PTR [rcx+-56], rax + mov QWORD PTR [rcx+-48], r10 + mov QWORD PTR [rcx+-40], r11 + mov QWORD PTR [rcx+-32], r12 + mov 
QWORD PTR [rcx+-24], r13 + mov QWORD PTR [rcx+-16], rbp + mov rax, QWORD PTR [rdx+88] + mov r10, QWORD PTR [rdx+96] + mov r11, QWORD PTR [rdx+104] + mov r12, QWORD PTR [rdx+112] + mov r13, QWORD PTR [rdx+120] + mov rbp, QWORD PTR [rdx+128] + xor rax, QWORD PTR [rcx+-8] + xor r10, QWORD PTR [rcx] + xor r11, QWORD PTR [rcx+8] + xor r12, QWORD PTR [rcx+16] + xor r13, QWORD PTR [rcx+24] + xor rbp, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+-8], rax + mov QWORD PTR [rcx], r10 + mov QWORD PTR [rcx+8], r11 + mov QWORD PTR [rcx+16], r12 + mov QWORD PTR [rcx+24], r13 + mov QWORD PTR [rcx+32], rbp + mov rax, QWORD PTR [rdx+136] + mov r10, QWORD PTR [rdx+144] + mov r11, QWORD PTR [rdx+152] + mov r12, QWORD PTR [rdx+160] + xor rax, QWORD PTR [rcx+40] + xor r10, QWORD PTR [rcx+48] + xor r11, QWORD PTR [rcx+56] + xor r12, QWORD PTR [rcx+64] + mov QWORD PTR [rcx+40], rax + mov QWORD PTR [rcx+48], r10 + mov QWORD PTR [rcx+56], r11 + mov QWORD PTR [rcx+64], r12 + jmp L_sha3_block_n_bmi2_rounds +L_sha3_block_n_bmi2_load_224: + mov r14, QWORD PTR [rdx+40] + mov r15, QWORD PTR [rdx+48] + mov rdi, QWORD PTR [rdx+56] + mov rsi, QWORD PTR [rdx+64] + mov rbx, QWORD PTR [rdx+72] + mov rax, QWORD PTR [rdx+80] + mov r10, QWORD PTR [rdx+88] + mov r11, QWORD PTR [rdx+96] + mov r12, QWORD PTR [rdx+104] + mov r13, QWORD PTR [rdx+112] + xor r14, QWORD PTR [rcx+-56] + xor r15, QWORD PTR [rcx+-48] + xor rdi, QWORD PTR [rcx+-40] + xor rsi, QWORD PTR [rcx+-32] + xor rbx, QWORD PTR [rcx+-24] + xor rax, QWORD PTR [rcx+-16] + xor r10, QWORD PTR [rcx+-8] + xor r11, QWORD PTR [rcx] + xor r12, QWORD PTR [rcx+8] + xor r13, QWORD PTR [rcx+16] + mov QWORD PTR [rcx+-56], r14 + mov QWORD PTR [rcx+-48], r15 + mov QWORD PTR [rcx+-40], rdi + mov QWORD PTR [rcx+-32], rsi + mov QWORD PTR [rcx+-24], rbx + mov QWORD PTR [rcx+-16], rax + mov QWORD PTR [rcx+-8], r10 + mov QWORD PTR [rcx], r11 + mov QWORD PTR [rcx+8], r12 + mov QWORD PTR [rcx+16], r13 + mov r14, QWORD PTR [rdx] + mov r15, QWORD PTR [rdx+8] + mov rdi, QWORD 
PTR [rdx+16] + mov rsi, QWORD PTR [rdx+24] + mov rbx, QWORD PTR [rdx+32] + mov rax, QWORD PTR [rdx+120] + mov r10, QWORD PTR [rdx+128] + mov r11, QWORD PTR [rdx+136] + xor r14, r9 + xor r15, QWORD PTR [rcx+-88] + xor rdi, QWORD PTR [rcx+-80] + xor rsi, QWORD PTR [rcx+-72] + xor rbx, QWORD PTR [rcx+-64] + xor rax, QWORD PTR [rcx+24] + xor r10, QWORD PTR [rcx+32] + xor r11, QWORD PTR [rcx+40] + mov r9, r14 + mov QWORD PTR [rcx+-88], r15 + mov QWORD PTR [rcx+-80], rdi + mov QWORD PTR [rcx+-72], rsi + mov QWORD PTR [rcx+-64], rbx + mov QWORD PTR [rcx+24], rax + mov QWORD PTR [rcx+32], r10 + mov QWORD PTR [rcx+40], r11 + jmp L_sha3_block_n_bmi2_rounds +L_sha3_block_n_bmi2_load_384: + mov r14, QWORD PTR [rdx] + mov r15, QWORD PTR [rdx+8] + mov rdi, QWORD PTR [rdx+16] + mov rsi, QWORD PTR [rdx+24] + mov rbx, QWORD PTR [rdx+32] + mov rax, QWORD PTR [rdx+40] + mov r10, QWORD PTR [rdx+48] + mov r11, QWORD PTR [rdx+56] + mov r12, QWORD PTR [rdx+64] + xor r14, r9 + xor r15, QWORD PTR [rcx+-88] + xor rdi, QWORD PTR [rcx+-80] + xor rsi, QWORD PTR [rcx+-72] + xor rbx, QWORD PTR [rcx+-64] + xor rax, QWORD PTR [rcx+-56] + xor r10, QWORD PTR [rcx+-48] + xor r11, QWORD PTR [rcx+-40] + xor r12, QWORD PTR [rcx+-32] + mov r9, r14 + mov QWORD PTR [rcx+-88], r15 + mov QWORD PTR [rcx+-80], rdi + mov QWORD PTR [rcx+-72], rsi + mov QWORD PTR [rcx+-64], rbx + mov QWORD PTR [rcx+-56], rax + mov QWORD PTR [rcx+-48], r10 + mov QWORD PTR [rcx+-40], r11 + mov QWORD PTR [rcx+-32], r12 + mov rax, QWORD PTR [rdx+72] + mov r10, QWORD PTR [rdx+80] + mov r11, QWORD PTR [rdx+88] + mov r12, QWORD PTR [rdx+96] + xor rax, QWORD PTR [rcx+-24] + xor r10, QWORD PTR [rcx+-16] + xor r11, QWORD PTR [rcx+-8] + xor r12, QWORD PTR [rcx] + mov QWORD PTR [rcx+-24], rax + mov QWORD PTR [rcx+-16], r10 + mov QWORD PTR [rcx+-8], r11 + mov QWORD PTR [rcx], r12 + jmp L_sha3_block_n_bmi2_rounds +L_sha3_block_n_bmi2_load_256: + mov r14, QWORD PTR [rdx] + mov r15, QWORD PTR [rdx+8] + mov rdi, QWORD PTR [rdx+16] + mov rsi, 
QWORD PTR [rdx+24] + mov rbx, QWORD PTR [rdx+32] + mov rax, QWORD PTR [rdx+40] + mov r10, QWORD PTR [rdx+48] + mov r11, QWORD PTR [rdx+56] + mov r12, QWORD PTR [rdx+64] + mov r13, QWORD PTR [rdx+72] + mov rbp, QWORD PTR [rdx+80] + xor r14, r9 + xor r15, QWORD PTR [rcx+-88] + xor rdi, QWORD PTR [rcx+-80] + xor rsi, QWORD PTR [rcx+-72] + xor rbx, QWORD PTR [rcx+-64] + xor rax, QWORD PTR [rcx+-56] + xor r10, QWORD PTR [rcx+-48] + xor r11, QWORD PTR [rcx+-40] + xor r12, QWORD PTR [rcx+-32] + xor r13, QWORD PTR [rcx+-24] + xor rbp, QWORD PTR [rcx+-16] + mov r9, r14 + mov QWORD PTR [rcx+-88], r15 + mov QWORD PTR [rcx+-80], rdi + mov QWORD PTR [rcx+-72], rsi + mov QWORD PTR [rcx+-64], rbx + mov QWORD PTR [rcx+-56], rax + mov QWORD PTR [rcx+-48], r10 + mov QWORD PTR [rcx+-40], r11 + mov QWORD PTR [rcx+-32], r12 + mov QWORD PTR [rcx+-24], r13 + mov QWORD PTR [rcx+-16], rbp + mov rax, QWORD PTR [rdx+88] + mov r10, QWORD PTR [rdx+96] + mov r11, QWORD PTR [rdx+104] + mov r12, QWORD PTR [rdx+112] + mov r13, QWORD PTR [rdx+120] + mov rbp, QWORD PTR [rdx+128] + xor rax, QWORD PTR [rcx+-8] + xor r10, QWORD PTR [rcx] + xor r11, QWORD PTR [rcx+8] + xor r12, QWORD PTR [rcx+16] + xor r13, QWORD PTR [rcx+24] + xor rbp, QWORD PTR [rcx+32] + mov QWORD PTR [rcx+-8], rax + mov QWORD PTR [rcx], r10 + mov QWORD PTR [rcx+8], r11 + mov QWORD PTR [rcx+16], r12 + mov QWORD PTR [rcx+24], r13 + mov QWORD PTR [rcx+32], rbp +L_sha3_block_n_bmi2_rounds: + ; Round 0 + xor r14, QWORD PTR [rcx+-56] + xor r15, QWORD PTR [rcx+-48] + xor rdi, QWORD PTR [rcx+-40] + xor rsi, QWORD PTR [rcx+-32] + xor rbx, QWORD PTR [rcx+-24] + xor r14, QWORD PTR [rcx+-16] + xor r15, QWORD PTR [rcx+-8] + xor rdi, QWORD PTR [rcx] + xor rsi, QWORD PTR [rcx+8] + xor rbx, QWORD PTR [rcx+16] + xor r14, QWORD PTR [rcx+24] + xor r15, QWORD PTR [rcx+32] + xor rdi, QWORD PTR [rcx+40] + xor rsi, QWORD PTR [rcx+48] + xor rbx, QWORD PTR [rcx+56] + xor r14, QWORD PTR [rcx+64] + xor r15, QWORD PTR [rcx+72] + xor rdi, QWORD PTR [rcx+80] + 
xor rsi, QWORD PTR [rcx+88] + xor rbx, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+-48] + mov rdi, QWORD PTR [rcx] + mov rsi, QWORD PTR [rcx+48] + mov rbx, QWORD PTR [rcx+96] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-48], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+48], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+96], rsi + ; XOR in constant + xor r9, 1 + ; Row 1 + mov r14, QWORD PTR [rcx+-72] + mov r15, QWORD PTR [rcx+-24] + mov rdi, QWORD PTR [rcx+-16] + mov rsi, QWORD PTR [rcx+32] + mov rbx, QWORD PTR [rcx+80] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-24], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-16], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+32], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+80], rbp + mov QWORD PTR [rcx+-72], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+-88] + mov r15, QWORD PTR [rcx+-40] + mov rdi, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rcx+56] + mov rbx, QWORD PTR [rcx+64] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-40], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+8], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+56], 
rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+64], rbp + mov QWORD PTR [rcx+-88], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+-64] + mov r15, QWORD PTR [rcx+-56] + mov rdi, QWORD PTR [rcx+-8] + mov rsi, QWORD PTR [rcx+40] + mov rbx, QWORD PTR [rcx+88] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-56], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-8], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+40], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+88], rbp + mov QWORD PTR [rcx+-64], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+-80] + xor r12, QWORD PTR [rcx+-32] + xor r13, QWORD PTR [rcx+16] + xor rax, QWORD PTR [rcx+24] + xor r10, QWORD PTR [rcx+72] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+-80], r14 + mov QWORD PTR [rcx+-32], r15 + mov QWORD PTR [rcx+16], rdi + mov QWORD PTR [rcx+24], rsi + mov QWORD PTR [rcx+72], rbx + ; Round 1 + xor r14, r9 + xor r14, QWORD PTR [rcx+-88] + xor r14, QWORD PTR [rcx+-72] + xor r14, QWORD PTR [rcx+-64] + xor r15, QWORD PTR [rcx+-56] + xor r15, QWORD PTR [rcx+-48] + xor r15, QWORD PTR [rcx+-40] + xor r15, QWORD PTR [rcx+-24] + xor rdi, QWORD PTR [rcx+-16] + xor rdi, QWORD PTR [rcx+-8] + xor rdi, QWORD PTR [rcx] + xor rdi, QWORD PTR [rcx+8] + xor rsi, QWORD PTR [rcx+32] + xor rsi, QWORD PTR [rcx+40] + xor rsi, QWORD PTR [rcx+48] + xor rsi, QWORD PTR [rcx+56] + xor rbx, QWORD PTR [rcx+64] + xor rbx, QWORD PTR [rcx+80] + xor rbx, QWORD PTR [rcx+88] + xor rbx, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, 
r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+-24] + mov rdi, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rcx+40] + mov rbx, QWORD PTR [rcx+72] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-24], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+8], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+40], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+72], rsi + ; XOR in constant + xor r9, 32898 + ; Row 1 + mov r14, QWORD PTR [rcx+48] + mov r15, QWORD PTR [rcx+80] + mov rdi, QWORD PTR [rcx+-88] + mov rsi, QWORD PTR [rcx+-56] + mov rbx, QWORD PTR [rcx+16] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+80], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-88], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-56], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+16], rbp + mov QWORD PTR [rcx+48], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+-48] + mov r15, QWORD PTR [rcx+-16] + mov rdi, QWORD PTR [rcx+56] + mov rsi, QWORD PTR [rcx+88] + mov rbx, QWORD PTR [rcx+-80] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-16], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+56], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+88], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + 
mov QWORD PTR [rcx+-80], rbp + mov QWORD PTR [rcx+-48], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+96] + mov r15, QWORD PTR [rcx+-72] + mov rdi, QWORD PTR [rcx+-40] + mov rsi, QWORD PTR [rcx+-8] + mov rbx, QWORD PTR [rcx+24] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-72], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-40], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-8], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+24], rbp + mov QWORD PTR [rcx+96], rsi + ; Row 4 + xor r11, QWORD PTR [rcx] + xor r12, QWORD PTR [rcx+32] + xor r13, QWORD PTR [rcx+64] + xor rax, QWORD PTR [rcx+-64] + xor r10, QWORD PTR [rcx+-32] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx], r14 + mov QWORD PTR [rcx+32], r15 + mov QWORD PTR [rcx+64], rdi + mov QWORD PTR [rcx+-64], rsi + mov QWORD PTR [rcx+-32], rbx + ; Round 2 + xor r14, r9 + xor rdi, QWORD PTR [rcx+-88] + xor rbx, QWORD PTR [rcx+-80] + xor r15, QWORD PTR [rcx+-72] + xor rsi, QWORD PTR [rcx+-56] + xor r14, QWORD PTR [rcx+-48] + xor rdi, QWORD PTR [rcx+-40] + xor r15, QWORD PTR [rcx+-24] + xor r15, QWORD PTR [rcx+-16] + xor rsi, QWORD PTR [rcx+-8] + xor rdi, QWORD PTR [rcx+8] + xor rbx, QWORD PTR [rcx+16] + xor rbx, QWORD PTR [rcx+24] + xor rsi, QWORD PTR [rcx+40] + xor r14, QWORD PTR [rcx+48] + xor rdi, QWORD PTR [rcx+56] + xor rbx, QWORD PTR [rcx+72] + xor r15, QWORD PTR [rcx+80] + xor rsi, QWORD PTR [rcx+88] + xor r14, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, 
r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+80] + mov rdi, QWORD PTR [rcx+56] + mov rsi, QWORD PTR [rcx+-8] + mov rbx, QWORD PTR [rcx+-32] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+80], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+56], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-8], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+-32], rsi + ; XOR in constant + mov rbx, 9223372036854808714 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+40] + mov r15, QWORD PTR [rcx+16] + mov rdi, QWORD PTR [rcx+-48] + mov rsi, QWORD PTR [rcx+-72] + mov rbx, QWORD PTR [rcx+64] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+16], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-48], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-72], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+64], rbp + mov QWORD PTR [rcx+40], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+-24] + mov r15, QWORD PTR [rcx+-88] + mov rdi, QWORD PTR [rcx+88] + mov rsi, QWORD PTR [rcx+24] + mov rbx, QWORD PTR [rcx] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-88], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+88], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+24], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx], rbp + mov QWORD PTR [rcx+-24], 
rsi + ; Row 3 + mov r14, QWORD PTR [rcx+72] + mov r15, QWORD PTR [rcx+48] + mov rdi, QWORD PTR [rcx+-16] + mov rsi, QWORD PTR [rcx+-40] + mov rbx, QWORD PTR [rcx+-64] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+48], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-16], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-40], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-64], rbp + mov QWORD PTR [rcx+72], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+8] + xor r12, QWORD PTR [rcx+-56] + xor r13, QWORD PTR [rcx+-80] + xor rax, QWORD PTR [rcx+96] + xor r10, QWORD PTR [rcx+32] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+8], r14 + mov QWORD PTR [rcx+-56], r15 + mov QWORD PTR [rcx+-80], rdi + mov QWORD PTR [rcx+96], rsi + mov QWORD PTR [rcx+32], rbx + ; Round 3 + xor r14, r9 + xor r15, QWORD PTR [rcx+-88] + xor rsi, QWORD PTR [rcx+-72] + xor rbx, QWORD PTR [rcx+-64] + xor rdi, QWORD PTR [rcx+-48] + xor rsi, QWORD PTR [rcx+-40] + xor rbx, QWORD PTR [rcx+-32] + xor r14, QWORD PTR [rcx+-24] + xor rdi, QWORD PTR [rcx+-16] + xor rsi, QWORD PTR [rcx+-8] + xor rbx, QWORD PTR [rcx] + xor r15, QWORD PTR [rcx+16] + xor rsi, QWORD PTR [rcx+24] + xor r14, QWORD PTR [rcx+40] + xor r15, QWORD PTR [rcx+48] + xor rdi, QWORD PTR [rcx+56] + xor rbx, QWORD PTR [rcx+64] + xor r14, QWORD PTR [rcx+72] + xor r15, QWORD PTR [rcx+80] + xor rdi, QWORD PTR [rcx+88] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + 
xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+16] + mov rdi, QWORD PTR [rcx+88] + mov rsi, QWORD PTR [rcx+-40] + mov rbx, QWORD PTR [rcx+32] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+16], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+88], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-40], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+32], rsi + ; XOR in constant + mov rbx, 9223372039002292224 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+-8] + mov r15, QWORD PTR [rcx+64] + mov rdi, QWORD PTR [rcx+-24] + mov rsi, QWORD PTR [rcx+48] + mov rbx, QWORD PTR [rcx+-80] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+64], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-24], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+48], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-80], rbp + mov QWORD PTR [rcx+-8], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+80] + mov r15, QWORD PTR [rcx+-48] + mov rdi, QWORD PTR [rcx+24] + mov rsi, QWORD PTR [rcx+-64] + mov rbx, QWORD PTR [rcx+8] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-48], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+24], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-64], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+8], rbp + mov QWORD PTR [rcx+80], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+-32] + mov 
r15, QWORD PTR [rcx+40] + mov rdi, QWORD PTR [rcx+-88] + mov rsi, QWORD PTR [rcx+-16] + mov rbx, QWORD PTR [rcx+96] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+40], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-88], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-16], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+96], rbp + mov QWORD PTR [rcx+-32], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+56] + xor r12, QWORD PTR [rcx+-72] + xor r13, QWORD PTR [rcx] + xor rax, QWORD PTR [rcx+72] + xor r10, QWORD PTR [rcx+-56] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+56], r14 + mov QWORD PTR [rcx+-72], r15 + mov QWORD PTR [rcx], rdi + mov QWORD PTR [rcx+72], rsi + mov QWORD PTR [rcx+-56], rbx + ; Round 4 + xor r14, r9 + xor rdi, QWORD PTR [rcx+-88] + xor rbx, QWORD PTR [rcx+-80] + xor rsi, QWORD PTR [rcx+-64] + xor r15, QWORD PTR [rcx+-48] + xor rsi, QWORD PTR [rcx+-40] + xor r14, QWORD PTR [rcx+-32] + xor rdi, QWORD PTR [rcx+-24] + xor rsi, QWORD PTR [rcx+-16] + xor r14, QWORD PTR [rcx+-8] + xor rbx, QWORD PTR [rcx+8] + xor r15, QWORD PTR [rcx+16] + xor rdi, QWORD PTR [rcx+24] + xor rbx, QWORD PTR [rcx+32] + xor r15, QWORD PTR [rcx+40] + xor rsi, QWORD PTR [rcx+48] + xor r15, QWORD PTR [rcx+64] + xor r14, QWORD PTR [rcx+80] + xor rdi, QWORD PTR [rcx+88] + xor rbx, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + 
mov r14, r9 + mov r15, QWORD PTR [rcx+64] + mov rdi, QWORD PTR [rcx+24] + mov rsi, QWORD PTR [rcx+-16] + mov rbx, QWORD PTR [rcx+-56] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+64], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+24], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-16], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+-56], rsi + ; XOR in constant + xor r9, 32907 + ; Row 1 + mov r14, QWORD PTR [rcx+-40] + mov r15, QWORD PTR [rcx+-80] + mov rdi, QWORD PTR [rcx+80] + mov rsi, QWORD PTR [rcx+40] + mov rbx, QWORD PTR [rcx] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-80], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+80], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+40], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx], rbp + mov QWORD PTR [rcx+-40], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+16] + mov r15, QWORD PTR [rcx+-24] + mov rdi, QWORD PTR [rcx+-64] + mov rsi, QWORD PTR [rcx+96] + mov rbx, QWORD PTR [rcx+56] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-24], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-64], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+96], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+56], rbp + mov QWORD PTR [rcx+16], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+32] + mov r15, QWORD PTR [rcx+-8] + mov rdi, QWORD PTR [rcx+-48] + mov rsi, QWORD PTR 
[rcx+-88] + mov rbx, QWORD PTR [rcx+72] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-8], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-48], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-88], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+72], rbp + mov QWORD PTR [rcx+32], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+88] + xor r12, QWORD PTR [rcx+48] + xor r13, QWORD PTR [rcx+8] + xor rax, QWORD PTR [rcx+-32] + xor r10, QWORD PTR [rcx+-72] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+88], r14 + mov QWORD PTR [rcx+48], r15 + mov QWORD PTR [rcx+8], rdi + mov QWORD PTR [rcx+-32], rsi + mov QWORD PTR [rcx+-72], rbx + ; Round 5 + xor r14, r9 + xor rsi, QWORD PTR [rcx+-88] + xor r15, QWORD PTR [rcx+-80] + xor rdi, QWORD PTR [rcx+-64] + xor rbx, QWORD PTR [rcx+-56] + xor rdi, QWORD PTR [rcx+-48] + xor r14, QWORD PTR [rcx+-40] + xor r15, QWORD PTR [rcx+-24] + xor rsi, QWORD PTR [rcx+-16] + xor r15, QWORD PTR [rcx+-8] + xor rbx, QWORD PTR [rcx] + xor r14, QWORD PTR [rcx+16] + xor rdi, QWORD PTR [rcx+24] + xor r14, QWORD PTR [rcx+32] + xor rsi, QWORD PTR [rcx+40] + xor rbx, QWORD PTR [rcx+56] + xor r15, QWORD PTR [rcx+64] + xor rbx, QWORD PTR [rcx+72] + xor rdi, QWORD PTR [rcx+80] + xor rsi, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+-80] + mov rdi, QWORD PTR [rcx+-64] + 
mov rsi, QWORD PTR [rcx+-88] + mov rbx, QWORD PTR [rcx+-72] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-80], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-64], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-88], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+-72], rsi + ; XOR in constant + mov rbx, 2147483649 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+-16] + mov r15, QWORD PTR [rcx] + mov rdi, QWORD PTR [rcx+16] + mov rsi, QWORD PTR [rcx+-8] + mov rbx, QWORD PTR [rcx+8] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+16], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-8], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+8], rbp + mov QWORD PTR [rcx+-16], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+64] + mov r15, QWORD PTR [rcx+80] + mov rdi, QWORD PTR [rcx+96] + mov rsi, QWORD PTR [rcx+72] + mov rbx, QWORD PTR [rcx+88] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+80], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+96], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+72], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+88], rbp + mov QWORD PTR [rcx+64], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+-56] + mov r15, QWORD PTR [rcx+-40] + mov rdi, QWORD PTR [rcx+-24] + mov rsi, QWORD PTR [rcx+-48] + mov rbx, QWORD PTR [rcx+-32] + xor r14, r13 + xor r15, 
rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-40], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-24], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-48], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-32], rbp + mov QWORD PTR [rcx+-56], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+24] + xor r12, QWORD PTR [rcx+40] + xor r13, QWORD PTR [rcx+56] + xor rax, QWORD PTR [rcx+32] + xor r10, QWORD PTR [rcx+48] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+24], r14 + mov QWORD PTR [rcx+40], r15 + mov QWORD PTR [rcx+56], rdi + mov QWORD PTR [rcx+32], rsi + mov QWORD PTR [rcx+48], rbx + ; Round 6 + xor r14, r9 + xor rsi, QWORD PTR [rcx+-88] + xor r15, QWORD PTR [rcx+-80] + xor rbx, QWORD PTR [rcx+-72] + xor rdi, QWORD PTR [rcx+-64] + xor r14, QWORD PTR [rcx+-56] + xor rsi, QWORD PTR [rcx+-48] + xor r15, QWORD PTR [rcx+-40] + xor rbx, QWORD PTR [rcx+-32] + xor rdi, QWORD PTR [rcx+-24] + xor r14, QWORD PTR [rcx+-16] + xor rsi, QWORD PTR [rcx+-8] + xor r15, QWORD PTR [rcx] + xor rbx, QWORD PTR [rcx+8] + xor rdi, QWORD PTR [rcx+16] + xor r14, QWORD PTR [rcx+64] + xor rsi, QWORD PTR [rcx+72] + xor r15, QWORD PTR [rcx+80] + xor rbx, QWORD PTR [rcx+88] + xor rdi, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx] + mov rdi, QWORD PTR [rcx+96] + mov rsi, QWORD PTR [rcx+-48] + mov rbx, QWORD PTR [rcx+48] + xor 
r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+96], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-48], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+48], rsi + ; XOR in constant + mov rbx, 9223372039002292353 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+-88] + mov r15, QWORD PTR [rcx+8] + mov rdi, QWORD PTR [rcx+64] + mov rsi, QWORD PTR [rcx+-40] + mov rbx, QWORD PTR [rcx+56] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+8], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+64], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-40], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+56], rbp + mov QWORD PTR [rcx+-88], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+-80] + mov r15, QWORD PTR [rcx+16] + mov rdi, QWORD PTR [rcx+72] + mov rsi, QWORD PTR [rcx+-32] + mov rbx, QWORD PTR [rcx+24] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+16], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+72], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-32], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+24], rbp + mov QWORD PTR [rcx+-80], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+-72] + mov r15, QWORD PTR [rcx+-16] + mov rdi, QWORD PTR [rcx+80] + mov rsi, QWORD PTR [rcx+-24] + mov rbx, QWORD PTR [rcx+32] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + 
rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-16], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+80], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-24], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+32], rbp + mov QWORD PTR [rcx+-72], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+-64] + xor r12, QWORD PTR [rcx+-8] + xor r13, QWORD PTR [rcx+88] + xor rax, QWORD PTR [rcx+-56] + xor r10, QWORD PTR [rcx+40] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+-64], r14 + mov QWORD PTR [rcx+-8], r15 + mov QWORD PTR [rcx+88], rdi + mov QWORD PTR [rcx+-56], rsi + mov QWORD PTR [rcx+40], rbx + ; Round 7 + xor r14, r9 + xor r14, QWORD PTR [rcx+-88] + xor r14, QWORD PTR [rcx+-80] + xor r14, QWORD PTR [rcx+-72] + xor rsi, QWORD PTR [rcx+-48] + xor rsi, QWORD PTR [rcx+-40] + xor rsi, QWORD PTR [rcx+-32] + xor rsi, QWORD PTR [rcx+-24] + xor r15, QWORD PTR [rcx+-16] + xor r15, QWORD PTR [rcx] + xor r15, QWORD PTR [rcx+8] + xor r15, QWORD PTR [rcx+16] + xor rbx, QWORD PTR [rcx+24] + xor rbx, QWORD PTR [rcx+32] + xor rbx, QWORD PTR [rcx+48] + xor rbx, QWORD PTR [rcx+56] + xor rdi, QWORD PTR [rcx+64] + xor rdi, QWORD PTR [rcx+72] + xor rdi, QWORD PTR [rcx+80] + xor rdi, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+8] + mov rdi, QWORD PTR [rcx+72] + mov rsi, QWORD PTR [rcx+-24] + mov rbx, QWORD PTR [rcx+40] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, 
r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+8], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+72], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-24], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+40], rsi + ; XOR in constant + mov rbx, 9223372036854808585 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+-48] + mov r15, QWORD PTR [rcx+56] + mov rdi, QWORD PTR [rcx+-80] + mov rsi, QWORD PTR [rcx+-16] + mov rbx, QWORD PTR [rcx+88] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+56], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-80], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-16], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+88], rbp + mov QWORD PTR [rcx+-48], rsi + ; Row 2 + mov r14, QWORD PTR [rcx] + mov r15, QWORD PTR [rcx+64] + mov rdi, QWORD PTR [rcx+-32] + mov rsi, QWORD PTR [rcx+32] + mov rbx, QWORD PTR [rcx+-64] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+64], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-32], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+32], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-64], rbp + mov QWORD PTR [rcx], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+48] + mov r15, QWORD PTR [rcx+-88] + mov rdi, QWORD PTR [rcx+16] + mov rsi, QWORD PTR [rcx+80] + mov rbx, QWORD PTR [rcx+-56] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 
15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-88], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+16], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+80], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-56], rbp + mov QWORD PTR [rcx+48], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+96] + xor r12, QWORD PTR [rcx+-40] + xor r13, QWORD PTR [rcx+24] + xor rax, QWORD PTR [rcx+-72] + xor r10, QWORD PTR [rcx+-8] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+96], r14 + mov QWORD PTR [rcx+-40], r15 + mov QWORD PTR [rcx+24], rdi + mov QWORD PTR [rcx+-72], rsi + mov QWORD PTR [rcx+-8], rbx + ; Round 8 + xor r14, r9 + xor r15, QWORD PTR [rcx+-88] + xor rdi, QWORD PTR [rcx+-80] + xor rbx, QWORD PTR [rcx+-64] + xor rbx, QWORD PTR [rcx+-56] + xor r14, QWORD PTR [rcx+-48] + xor rdi, QWORD PTR [rcx+-32] + xor rsi, QWORD PTR [rcx+-24] + xor rsi, QWORD PTR [rcx+-16] + xor r14, QWORD PTR [rcx] + xor r15, QWORD PTR [rcx+8] + xor rdi, QWORD PTR [rcx+16] + xor rsi, QWORD PTR [rcx+32] + xor rbx, QWORD PTR [rcx+40] + xor r14, QWORD PTR [rcx+48] + xor r15, QWORD PTR [rcx+56] + xor r15, QWORD PTR [rcx+64] + xor rdi, QWORD PTR [rcx+72] + xor rsi, QWORD PTR [rcx+80] + xor rbx, QWORD PTR [rcx+88] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+56] + mov rdi, QWORD PTR [rcx+-32] + mov rsi, QWORD PTR [rcx+80] + mov rbx, QWORD PTR [rcx+-8] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol 
rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+56], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-32], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+80], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+-8], rsi + ; XOR in constant + xor r9, 138 + ; Row 1 + mov r14, QWORD PTR [rcx+-24] + mov r15, QWORD PTR [rcx+88] + mov rdi, QWORD PTR [rcx] + mov rsi, QWORD PTR [rcx+-88] + mov rbx, QWORD PTR [rcx+24] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+88], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-88], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+24], rbp + mov QWORD PTR [rcx+-24], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+8] + mov r15, QWORD PTR [rcx+-80] + mov rdi, QWORD PTR [rcx+32] + mov rsi, QWORD PTR [rcx+-56] + mov rbx, QWORD PTR [rcx+96] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-80], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+32], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-56], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+96], rbp + mov QWORD PTR [rcx+8], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+40] + mov r15, QWORD PTR [rcx+-48] + mov rdi, QWORD PTR [rcx+64] + mov rsi, QWORD PTR [rcx+16] + mov rbx, QWORD PTR [rcx+-72] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-48], rbp + 
andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+64], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+16], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-72], rbp + mov QWORD PTR [rcx+40], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+72] + xor r12, QWORD PTR [rcx+-16] + xor r13, QWORD PTR [rcx+-64] + xor rax, QWORD PTR [rcx+48] + xor r10, QWORD PTR [rcx+-40] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+72], r14 + mov QWORD PTR [rcx+-16], r15 + mov QWORD PTR [rcx+-64], rdi + mov QWORD PTR [rcx+48], rsi + mov QWORD PTR [rcx+-40], rbx + ; Round 9 + xor r14, r9 + xor rsi, QWORD PTR [rcx+-88] + xor r15, QWORD PTR [rcx+-80] + xor rbx, QWORD PTR [rcx+-72] + xor rsi, QWORD PTR [rcx+-56] + xor r15, QWORD PTR [rcx+-48] + xor rdi, QWORD PTR [rcx+-32] + xor r14, QWORD PTR [rcx+-24] + xor rbx, QWORD PTR [rcx+-8] + xor rdi, QWORD PTR [rcx] + xor r14, QWORD PTR [rcx+8] + xor rsi, QWORD PTR [rcx+16] + xor rbx, QWORD PTR [rcx+24] + xor rdi, QWORD PTR [rcx+32] + xor r14, QWORD PTR [rcx+40] + xor r15, QWORD PTR [rcx+56] + xor rdi, QWORD PTR [rcx+64] + xor rsi, QWORD PTR [rcx+80] + xor r15, QWORD PTR [rcx+88] + xor rbx, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+88] + mov rdi, QWORD PTR [rcx+32] + mov rsi, QWORD PTR [rcx+16] + mov rbx, QWORD PTR [rcx+-40] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+88], 
rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+32], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+16], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+-40], rsi + ; XOR in constant + xor r9, 136 + ; Row 1 + mov r14, QWORD PTR [rcx+80] + mov r15, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rcx+-48] + mov rbx, QWORD PTR [rcx+-64] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+24], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+8], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-48], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-64], rbp + mov QWORD PTR [rcx+80], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+56] + mov r15, QWORD PTR [rcx] + mov rdi, QWORD PTR [rcx+-56] + mov rsi, QWORD PTR [rcx+-72] + mov rbx, QWORD PTR [rcx+72] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-56], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-72], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+72], rbp + mov QWORD PTR [rcx+56], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+-8] + mov r15, QWORD PTR [rcx+-24] + mov rdi, QWORD PTR [rcx+-80] + mov rsi, QWORD PTR [rcx+64] + mov rbx, QWORD PTR [rcx+48] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-24], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-80], rbp + andn rbp, rbx, r14 
+ xor rbp, rsi + mov QWORD PTR [rcx+64], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+48], rbp + mov QWORD PTR [rcx+-8], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+-32] + xor r12, QWORD PTR [rcx+-88] + xor r13, QWORD PTR [rcx+96] + xor rax, QWORD PTR [rcx+40] + xor r10, QWORD PTR [rcx+-16] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+-32], r14 + mov QWORD PTR [rcx+-88], r15 + mov QWORD PTR [rcx+96], rdi + mov QWORD PTR [rcx+40], rsi + mov QWORD PTR [rcx+-16], rbx + ; Round 10 + xor r14, r9 + xor rdi, QWORD PTR [rcx+-80] + xor rsi, QWORD PTR [rcx+-72] + xor rbx, QWORD PTR [rcx+-64] + xor rdi, QWORD PTR [rcx+-56] + xor rsi, QWORD PTR [rcx+-48] + xor rbx, QWORD PTR [rcx+-40] + xor r15, QWORD PTR [rcx+-24] + xor r14, QWORD PTR [rcx+-8] + xor r15, QWORD PTR [rcx] + xor rdi, QWORD PTR [rcx+8] + xor rsi, QWORD PTR [rcx+16] + xor r15, QWORD PTR [rcx+24] + xor rdi, QWORD PTR [rcx+32] + xor rbx, QWORD PTR [rcx+48] + xor r14, QWORD PTR [rcx+56] + xor rsi, QWORD PTR [rcx+64] + xor rbx, QWORD PTR [rcx+72] + xor r14, QWORD PTR [rcx+80] + xor r15, QWORD PTR [rcx+88] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [rcx+-56] + mov rsi, QWORD PTR [rcx+64] + mov rbx, QWORD PTR [rcx+-16] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+24], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-56], rbp + andn rbp, 
rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+64], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+-16], rsi + ; XOR in constant + mov rbx, 2147516425 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+16] + mov r15, QWORD PTR [rcx+-64] + mov rdi, QWORD PTR [rcx+56] + mov rsi, QWORD PTR [rcx+-24] + mov rbx, QWORD PTR [rcx+96] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-64], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+56], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-24], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+96], rbp + mov QWORD PTR [rcx+16], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+88] + mov r15, QWORD PTR [rcx+8] + mov rdi, QWORD PTR [rcx+-72] + mov rsi, QWORD PTR [rcx+48] + mov rbx, QWORD PTR [rcx+-32] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+8], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-72], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+48], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-32], rbp + mov QWORD PTR [rcx+88], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+-40] + mov r15, QWORD PTR [rcx+80] + mov rdi, QWORD PTR [rcx] + mov rsi, QWORD PTR [rcx+-80] + mov rbx, QWORD PTR [rcx+40] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+80], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-80], rbp + andn rbp, r14, 
r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+40], rbp + mov QWORD PTR [rcx+-40], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+32] + xor r12, QWORD PTR [rcx+-48] + xor r13, QWORD PTR [rcx+72] + xor rax, QWORD PTR [rcx+-8] + xor r10, QWORD PTR [rcx+-88] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+32], r14 + mov QWORD PTR [rcx+-48], r15 + mov QWORD PTR [rcx+72], rdi + mov QWORD PTR [rcx+-8], rsi + mov QWORD PTR [rcx+-88], rbx + ; Round 11 + xor r14, r9 + xor rsi, QWORD PTR [rcx+-80] + xor rdi, QWORD PTR [rcx+-72] + xor r15, QWORD PTR [rcx+-64] + xor rdi, QWORD PTR [rcx+-56] + xor r14, QWORD PTR [rcx+-40] + xor rbx, QWORD PTR [rcx+-32] + xor rsi, QWORD PTR [rcx+-24] + xor rbx, QWORD PTR [rcx+-16] + xor rdi, QWORD PTR [rcx] + xor r15, QWORD PTR [rcx+8] + xor r14, QWORD PTR [rcx+16] + xor r15, QWORD PTR [rcx+24] + xor rbx, QWORD PTR [rcx+40] + xor rsi, QWORD PTR [rcx+48] + xor rdi, QWORD PTR [rcx+56] + xor rsi, QWORD PTR [rcx+64] + xor r15, QWORD PTR [rcx+80] + xor r14, QWORD PTR [rcx+88] + xor rbx, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+-64] + mov rdi, QWORD PTR [rcx+-72] + mov rsi, QWORD PTR [rcx+-80] + mov rbx, QWORD PTR [rcx+-88] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-64], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-72], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-80], rbp + andn 
rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+-88], rsi + ; XOR in constant + mov rbx, 2147483658 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+64] + mov r15, QWORD PTR [rcx+96] + mov rdi, QWORD PTR [rcx+88] + mov rsi, QWORD PTR [rcx+80] + mov rbx, QWORD PTR [rcx+72] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+96], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+88], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+80], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+72], rbp + mov QWORD PTR [rcx+64], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+24] + mov r15, QWORD PTR [rcx+56] + mov rdi, QWORD PTR [rcx+48] + mov rsi, QWORD PTR [rcx+40] + mov rbx, QWORD PTR [rcx+32] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+56], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+48], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+40], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+32], rbp + mov QWORD PTR [rcx+24], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+-16] + mov r15, QWORD PTR [rcx+16] + mov rdi, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rcx] + mov rbx, QWORD PTR [rcx+-8] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+16], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+8], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR 
[rcx+-8], rbp + mov QWORD PTR [rcx+-16], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+-56] + xor r12, QWORD PTR [rcx+-24] + xor r13, QWORD PTR [rcx+-32] + xor rax, QWORD PTR [rcx+-40] + xor r10, QWORD PTR [rcx+-48] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+-56], r14 + mov QWORD PTR [rcx+-24], r15 + mov QWORD PTR [rcx+-32], rdi + mov QWORD PTR [rcx+-40], rsi + mov QWORD PTR [rcx+-48], rbx + ; Round 12 + xor r14, r9 + xor rbx, QWORD PTR [rcx+-88] + xor rsi, QWORD PTR [rcx+-80] + xor rdi, QWORD PTR [rcx+-72] + xor r15, QWORD PTR [rcx+-64] + xor r14, QWORD PTR [rcx+-16] + xor rbx, QWORD PTR [rcx+-8] + xor rsi, QWORD PTR [rcx] + xor rdi, QWORD PTR [rcx+8] + xor r15, QWORD PTR [rcx+16] + xor r14, QWORD PTR [rcx+24] + xor rbx, QWORD PTR [rcx+32] + xor rsi, QWORD PTR [rcx+40] + xor rdi, QWORD PTR [rcx+48] + xor r15, QWORD PTR [rcx+56] + xor r14, QWORD PTR [rcx+64] + xor rbx, QWORD PTR [rcx+72] + xor rsi, QWORD PTR [rcx+80] + xor rdi, QWORD PTR [rcx+88] + xor r15, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+96] + mov rdi, QWORD PTR [rcx+48] + mov rsi, QWORD PTR [rcx] + mov rbx, QWORD PTR [rcx+-48] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+96], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+48], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR 
[rcx+-48], rsi + ; XOR in constant + mov rbx, 2147516555 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+-80] + mov r15, QWORD PTR [rcx+72] + mov rdi, QWORD PTR [rcx+24] + mov rsi, QWORD PTR [rcx+16] + mov rbx, QWORD PTR [rcx+-32] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+72], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+24], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+16], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-32], rbp + mov QWORD PTR [rcx+-80], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+-64] + mov r15, QWORD PTR [rcx+88] + mov rdi, QWORD PTR [rcx+40] + mov rsi, QWORD PTR [rcx+-8] + mov rbx, QWORD PTR [rcx+-56] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+88], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+40], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-8], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-56], rbp + mov QWORD PTR [rcx+-64], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+-88] + mov r15, QWORD PTR [rcx+64] + mov rdi, QWORD PTR [rcx+56] + mov rsi, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+-40] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+64], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+56], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+8], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-40], rbp + mov QWORD PTR [rcx+-88], rsi + ; Row 4 + xor 
r11, QWORD PTR [rcx+-72] + xor r12, QWORD PTR [rcx+80] + xor r13, QWORD PTR [rcx+32] + xor rax, QWORD PTR [rcx+-16] + xor r10, QWORD PTR [rcx+-24] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+-72], r14 + mov QWORD PTR [rcx+80], r15 + mov QWORD PTR [rcx+32], rdi + mov QWORD PTR [rcx+-16], rsi + mov QWORD PTR [rcx+-24], rbx + ; Round 13 + xor r14, r9 + xor r14, QWORD PTR [rcx+-88] + xor r14, QWORD PTR [rcx+-80] + xor r14, QWORD PTR [rcx+-64] + xor rbx, QWORD PTR [rcx+-56] + xor rbx, QWORD PTR [rcx+-48] + xor rbx, QWORD PTR [rcx+-40] + xor rbx, QWORD PTR [rcx+-32] + xor rsi, QWORD PTR [rcx+-8] + xor rsi, QWORD PTR [rcx] + xor rsi, QWORD PTR [rcx+8] + xor rsi, QWORD PTR [rcx+16] + xor rdi, QWORD PTR [rcx+24] + xor rdi, QWORD PTR [rcx+40] + xor rdi, QWORD PTR [rcx+48] + xor rdi, QWORD PTR [rcx+56] + xor r15, QWORD PTR [rcx+64] + xor r15, QWORD PTR [rcx+72] + xor r15, QWORD PTR [rcx+88] + xor r15, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+72] + mov rdi, QWORD PTR [rcx+40] + mov rsi, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+-24] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+72], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+40], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+8], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+-24], rsi + ; XOR in constant + mov rbx, 
9223372036854775947 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx] + mov r15, QWORD PTR [rcx+-32] + mov rdi, QWORD PTR [rcx+-64] + mov rsi, QWORD PTR [rcx+64] + mov rbx, QWORD PTR [rcx+32] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-32], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-64], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+64], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+32], rbp + mov QWORD PTR [rcx], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+96] + mov r15, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [rcx+-8] + mov rsi, QWORD PTR [rcx+-40] + mov rbx, QWORD PTR [rcx+-72] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+24], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-8], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-40], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-72], rbp + mov QWORD PTR [rcx+96], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+-48] + mov r15, QWORD PTR [rcx+-80] + mov rdi, QWORD PTR [rcx+88] + mov rsi, QWORD PTR [rcx+56] + mov rbx, QWORD PTR [rcx+-16] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-80], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+88], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+56], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-16], rbp + mov QWORD PTR [rcx+-48], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+48] + xor r12, QWORD 
PTR [rcx+16] + xor r13, QWORD PTR [rcx+-56] + xor rax, QWORD PTR [rcx+-88] + xor r10, QWORD PTR [rcx+80] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+48], r14 + mov QWORD PTR [rcx+16], r15 + mov QWORD PTR [rcx+-56], rdi + mov QWORD PTR [rcx+-88], rsi + mov QWORD PTR [rcx+80], rbx + ; Round 14 + xor r14, r9 + xor r15, QWORD PTR [rcx+-80] + xor rbx, QWORD PTR [rcx+-72] + xor rdi, QWORD PTR [rcx+-64] + xor r14, QWORD PTR [rcx+-48] + xor rsi, QWORD PTR [rcx+-40] + xor r15, QWORD PTR [rcx+-32] + xor rbx, QWORD PTR [rcx+-24] + xor rbx, QWORD PTR [rcx+-16] + xor rdi, QWORD PTR [rcx+-8] + xor r14, QWORD PTR [rcx] + xor rsi, QWORD PTR [rcx+8] + xor r15, QWORD PTR [rcx+24] + xor rbx, QWORD PTR [rcx+32] + xor rdi, QWORD PTR [rcx+40] + xor rsi, QWORD PTR [rcx+56] + xor rsi, QWORD PTR [rcx+64] + xor r15, QWORD PTR [rcx+72] + xor rdi, QWORD PTR [rcx+88] + xor r14, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+-32] + mov rdi, QWORD PTR [rcx+-8] + mov rsi, QWORD PTR [rcx+56] + mov rbx, QWORD PTR [rcx+80] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-32], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-8], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+56], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+80], rsi + ; XOR in constant + mov rbx, 9223372036854808713 + xor r9, rbx + ; Row 1 + mov 
r14, QWORD PTR [rcx+8] + mov r15, QWORD PTR [rcx+32] + mov rdi, QWORD PTR [rcx+96] + mov rsi, QWORD PTR [rcx+-80] + mov rbx, QWORD PTR [rcx+-56] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+32], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+96], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-80], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-56], rbp + mov QWORD PTR [rcx+8], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+72] + mov r15, QWORD PTR [rcx+-64] + mov rdi, QWORD PTR [rcx+-40] + mov rsi, QWORD PTR [rcx+-16] + mov rbx, QWORD PTR [rcx+48] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-64], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-40], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-16], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+48], rbp + mov QWORD PTR [rcx+72], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+-24] + mov r15, QWORD PTR [rcx] + mov rdi, QWORD PTR [rcx+24] + mov rsi, QWORD PTR [rcx+88] + mov rbx, QWORD PTR [rcx+-88] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+24], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+88], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-88], rbp + mov QWORD PTR [rcx+-24], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+40] + xor r12, QWORD PTR [rcx+64] + xor r13, QWORD PTR [rcx+-72] + xor 
rax, QWORD PTR [rcx+-48] + xor r10, QWORD PTR [rcx+16] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+40], r14 + mov QWORD PTR [rcx+64], r15 + mov QWORD PTR [rcx+-72], rdi + mov QWORD PTR [rcx+-48], rsi + mov QWORD PTR [rcx+16], rbx + ; Round 15 + xor r14, r9 + xor rbx, QWORD PTR [rcx+-88] + xor rsi, QWORD PTR [rcx+-80] + xor r15, QWORD PTR [rcx+-64] + xor rbx, QWORD PTR [rcx+-56] + xor rdi, QWORD PTR [rcx+-40] + xor r15, QWORD PTR [rcx+-32] + xor r14, QWORD PTR [rcx+-24] + xor rsi, QWORD PTR [rcx+-16] + xor rdi, QWORD PTR [rcx+-8] + xor r15, QWORD PTR [rcx] + xor r14, QWORD PTR [rcx+8] + xor rdi, QWORD PTR [rcx+24] + xor r15, QWORD PTR [rcx+32] + xor rbx, QWORD PTR [rcx+48] + xor rsi, QWORD PTR [rcx+56] + xor r14, QWORD PTR [rcx+72] + xor rbx, QWORD PTR [rcx+80] + xor rsi, QWORD PTR [rcx+88] + xor rdi, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+32] + mov rdi, QWORD PTR [rcx+-40] + mov rsi, QWORD PTR [rcx+88] + mov rbx, QWORD PTR [rcx+16] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+32], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-40], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+88], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+16], rsi + ; XOR in constant + mov rbx, 9223372036854808579 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+56] + mov r15, QWORD PTR 
[rcx+-56] + mov rdi, QWORD PTR [rcx+72] + mov rsi, QWORD PTR [rcx] + mov rbx, QWORD PTR [rcx+-72] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-56], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+72], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-72], rbp + mov QWORD PTR [rcx+56], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+-32] + mov r15, QWORD PTR [rcx+96] + mov rdi, QWORD PTR [rcx+-16] + mov rsi, QWORD PTR [rcx+-88] + mov rbx, QWORD PTR [rcx+40] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+96], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-16], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-88], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+40], rbp + mov QWORD PTR [rcx+-32], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+80] + mov r15, QWORD PTR [rcx+8] + mov rdi, QWORD PTR [rcx+-64] + mov rsi, QWORD PTR [rcx+24] + mov rbx, QWORD PTR [rcx+-48] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+8], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-64], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+24], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-48], rbp + mov QWORD PTR [rcx+80], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+-8] + xor r12, QWORD PTR [rcx+-80] + xor r13, QWORD PTR [rcx+48] + xor rax, QWORD PTR [rcx+-24] + xor r10, QWORD PTR 
[rcx+64] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+-8], r14 + mov QWORD PTR [rcx+-80], r15 + mov QWORD PTR [rcx+48], rdi + mov QWORD PTR [rcx+-24], rsi + mov QWORD PTR [rcx+64], rbx + ; Round 16 + xor r14, r9 + xor rsi, QWORD PTR [rcx+-88] + xor rbx, QWORD PTR [rcx+-72] + xor rdi, QWORD PTR [rcx+-64] + xor r15, QWORD PTR [rcx+-56] + xor rbx, QWORD PTR [rcx+-48] + xor rdi, QWORD PTR [rcx+-40] + xor r14, QWORD PTR [rcx+-32] + xor rdi, QWORD PTR [rcx+-16] + xor rsi, QWORD PTR [rcx] + xor r15, QWORD PTR [rcx+8] + xor rbx, QWORD PTR [rcx+16] + xor rsi, QWORD PTR [rcx+24] + xor r15, QWORD PTR [rcx+32] + xor rbx, QWORD PTR [rcx+40] + xor r14, QWORD PTR [rcx+56] + xor rdi, QWORD PTR [rcx+72] + xor r14, QWORD PTR [rcx+80] + xor rsi, QWORD PTR [rcx+88] + xor r15, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+-56] + mov rdi, QWORD PTR [rcx+-16] + mov rsi, QWORD PTR [rcx+24] + mov rbx, QWORD PTR [rcx+64] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-56], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-16], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+24], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+64], rsi + ; XOR in constant + mov rbx, 9223372036854808578 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+88] + mov r15, QWORD PTR [rcx+-72] + mov rdi, QWORD PTR [rcx+-32] + mov rsi, 
QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+48] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-72], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-32], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+8], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+48], rbp + mov QWORD PTR [rcx+88], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+32] + mov r15, QWORD PTR [rcx+72] + mov rdi, QWORD PTR [rcx+-88] + mov rsi, QWORD PTR [rcx+-48] + mov rbx, QWORD PTR [rcx+-8] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+72], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-88], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-48], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-8], rbp + mov QWORD PTR [rcx+32], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+16] + mov r15, QWORD PTR [rcx+56] + mov rdi, QWORD PTR [rcx+96] + mov rsi, QWORD PTR [rcx+-64] + mov rbx, QWORD PTR [rcx+-24] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+56], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+96], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-64], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-24], rbp + mov QWORD PTR [rcx+16], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+-40] + xor r12, QWORD PTR [rcx] + xor r13, QWORD PTR [rcx+40] + xor rax, QWORD PTR [rcx+80] + xor r10, QWORD PTR [rcx+-80] + rorx r14, r11, 2 + rorx r15, r12, 9 + 
rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+-40], r14 + mov QWORD PTR [rcx], r15 + mov QWORD PTR [rcx+40], rdi + mov QWORD PTR [rcx+80], rsi + mov QWORD PTR [rcx+-80], rbx + ; Round 17 + xor r14, r9 + xor rdi, QWORD PTR [rcx+-88] + xor r15, QWORD PTR [rcx+-72] + xor rsi, QWORD PTR [rcx+-64] + xor r15, QWORD PTR [rcx+-56] + xor rsi, QWORD PTR [rcx+-48] + xor rdi, QWORD PTR [rcx+-32] + xor rbx, QWORD PTR [rcx+-24] + xor rdi, QWORD PTR [rcx+-16] + xor rbx, QWORD PTR [rcx+-8] + xor rsi, QWORD PTR [rcx+8] + xor r14, QWORD PTR [rcx+16] + xor rsi, QWORD PTR [rcx+24] + xor r14, QWORD PTR [rcx+32] + xor rbx, QWORD PTR [rcx+48] + xor r15, QWORD PTR [rcx+56] + xor rbx, QWORD PTR [rcx+64] + xor r15, QWORD PTR [rcx+72] + xor r14, QWORD PTR [rcx+88] + xor rdi, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+-72] + mov rdi, QWORD PTR [rcx+-88] + mov rsi, QWORD PTR [rcx+-64] + mov rbx, QWORD PTR [rcx+-80] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-72], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-88], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-64], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+-80], rsi + ; XOR in constant + mov rbx, 9223372036854775936 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+24] + mov r15, QWORD PTR [rcx+48] + mov rdi, QWORD PTR [rcx+32] + mov rsi, QWORD PTR [rcx+56] + mov rbx, QWORD PTR 
[rcx+40] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+48], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+32], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+56], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+40], rbp + mov QWORD PTR [rcx+24], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+-56] + mov r15, QWORD PTR [rcx+-32] + mov rdi, QWORD PTR [rcx+-48] + mov rsi, QWORD PTR [rcx+-24] + mov rbx, QWORD PTR [rcx+-40] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-32], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-48], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-24], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-40], rbp + mov QWORD PTR [rcx+-56], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+64] + mov r15, QWORD PTR [rcx+88] + mov rdi, QWORD PTR [rcx+72] + mov rsi, QWORD PTR [rcx+96] + mov rbx, QWORD PTR [rcx+80] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+88], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+72], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+96], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+80], rbp + mov QWORD PTR [rcx+64], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+-16] + xor r12, QWORD PTR [rcx+8] + xor r13, QWORD PTR [rcx+-8] + xor rax, QWORD PTR [rcx+16] + xor r10, QWORD PTR [rcx] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + 
rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+-16], r14 + mov QWORD PTR [rcx+8], r15 + mov QWORD PTR [rcx+-8], rdi + mov QWORD PTR [rcx+16], rsi + mov QWORD PTR [rcx], rbx + ; Round 18 + xor r14, r9 + xor rdi, QWORD PTR [rcx+-88] + xor rbx, QWORD PTR [rcx+-80] + xor r15, QWORD PTR [rcx+-72] + xor rsi, QWORD PTR [rcx+-64] + xor r14, QWORD PTR [rcx+-56] + xor rdi, QWORD PTR [rcx+-48] + xor rbx, QWORD PTR [rcx+-40] + xor r15, QWORD PTR [rcx+-32] + xor rsi, QWORD PTR [rcx+-24] + xor r14, QWORD PTR [rcx+24] + xor rdi, QWORD PTR [rcx+32] + xor rbx, QWORD PTR [rcx+40] + xor r15, QWORD PTR [rcx+48] + xor rsi, QWORD PTR [rcx+56] + xor r14, QWORD PTR [rcx+64] + xor rdi, QWORD PTR [rcx+72] + xor rbx, QWORD PTR [rcx+80] + xor r15, QWORD PTR [rcx+88] + xor rsi, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+48] + mov rdi, QWORD PTR [rcx+-48] + mov rsi, QWORD PTR [rcx+96] + mov rbx, QWORD PTR [rcx] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+48], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-48], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+96], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx], rsi + ; XOR in constant + xor r9, 32778 + ; Row 1 + mov r14, QWORD PTR [rcx+-64] + mov r15, QWORD PTR [rcx+40] + mov rdi, QWORD PTR [rcx+-56] + mov rsi, QWORD PTR [rcx+88] + mov rbx, QWORD PTR [rcx+-8] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + 
rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+40], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-56], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+88], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-8], rbp + mov QWORD PTR [rcx+-64], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+-72] + mov r15, QWORD PTR [rcx+32] + mov rdi, QWORD PTR [rcx+-24] + mov rsi, QWORD PTR [rcx+80] + mov rbx, QWORD PTR [rcx+-16] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+32], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-24], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+80], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-16], rbp + mov QWORD PTR [rcx+-72], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+-80] + mov r15, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [rcx+-32] + mov rsi, QWORD PTR [rcx+72] + mov rbx, QWORD PTR [rcx+16] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+24], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-32], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+72], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+16], rbp + mov QWORD PTR [rcx+-80], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+-88] + xor r12, QWORD PTR [rcx+56] + xor r13, QWORD PTR [rcx+-40] + xor rax, QWORD PTR [rcx+64] + xor r10, QWORD PTR [rcx+8] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + 
andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+-88], r14 + mov QWORD PTR [rcx+56], r15 + mov QWORD PTR [rcx+-40], rdi + mov QWORD PTR [rcx+64], rsi + mov QWORD PTR [rcx+8], rbx + ; Round 19 + xor r14, r9 + xor r14, QWORD PTR [rcx+-80] + xor r14, QWORD PTR [rcx+-72] + xor r14, QWORD PTR [rcx+-64] + xor rdi, QWORD PTR [rcx+-56] + xor rdi, QWORD PTR [rcx+-48] + xor rdi, QWORD PTR [rcx+-32] + xor rdi, QWORD PTR [rcx+-24] + xor rbx, QWORD PTR [rcx+-16] + xor rbx, QWORD PTR [rcx+-8] + xor rbx, QWORD PTR [rcx] + xor rbx, QWORD PTR [rcx+16] + xor r15, QWORD PTR [rcx+24] + xor r15, QWORD PTR [rcx+32] + xor r15, QWORD PTR [rcx+40] + xor r15, QWORD PTR [rcx+48] + xor rsi, QWORD PTR [rcx+72] + xor rsi, QWORD PTR [rcx+80] + xor rsi, QWORD PTR [rcx+88] + xor rsi, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+40] + mov rdi, QWORD PTR [rcx+-24] + mov rsi, QWORD PTR [rcx+72] + mov rbx, QWORD PTR [rcx+8] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+40], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-24], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+72], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+8], rsi + ; XOR in constant + mov rbx, 9223372039002259466 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+96] + mov r15, QWORD PTR [rcx+-8] + mov rdi, QWORD PTR [rcx+-72] + mov rsi, QWORD PTR [rcx+24] + mov rbx, QWORD PTR [rcx+-40] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 
45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-8], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-72], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+24], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-40], rbp + mov QWORD PTR [rcx+96], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+48] + mov r15, QWORD PTR [rcx+-56] + mov rdi, QWORD PTR [rcx+80] + mov rsi, QWORD PTR [rcx+16] + mov rbx, QWORD PTR [rcx+-88] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-56], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+80], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+16], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-88], rbp + mov QWORD PTR [rcx+48], rsi + ; Row 3 + mov r14, QWORD PTR [rcx] + mov r15, QWORD PTR [rcx+-64] + mov rdi, QWORD PTR [rcx+32] + mov rsi, QWORD PTR [rcx+-32] + mov rbx, QWORD PTR [rcx+64] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-64], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+32], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-32], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+64], rbp + mov QWORD PTR [rcx], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+-48] + xor r12, QWORD PTR [rcx+88] + xor r13, QWORD PTR [rcx+-16] + xor rax, QWORD PTR [rcx+-80] + xor r10, QWORD PTR [rcx+56] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax 
+ xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+-48], r14 + mov QWORD PTR [rcx+88], r15 + mov QWORD PTR [rcx+-16], rdi + mov QWORD PTR [rcx+-80], rsi + mov QWORD PTR [rcx+56], rbx + ; Round 20 + xor r14, r9 + xor rbx, QWORD PTR [rcx+-88] + xor rdi, QWORD PTR [rcx+-72] + xor r15, QWORD PTR [rcx+-64] + xor r15, QWORD PTR [rcx+-56] + xor rbx, QWORD PTR [rcx+-40] + xor rsi, QWORD PTR [rcx+-32] + xor rdi, QWORD PTR [rcx+-24] + xor r15, QWORD PTR [rcx+-8] + xor r14, QWORD PTR [rcx] + xor rbx, QWORD PTR [rcx+8] + xor rsi, QWORD PTR [rcx+16] + xor rsi, QWORD PTR [rcx+24] + xor rdi, QWORD PTR [rcx+32] + xor r15, QWORD PTR [rcx+40] + xor r14, QWORD PTR [rcx+48] + xor rbx, QWORD PTR [rcx+64] + xor rsi, QWORD PTR [rcx+72] + xor rdi, QWORD PTR [rcx+80] + xor r14, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+-8] + mov rdi, QWORD PTR [rcx+80] + mov rsi, QWORD PTR [rcx+-32] + mov rbx, QWORD PTR [rcx+56] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-8], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+80], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-32], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+56], rsi + ; XOR in constant + mov rbx, 9223372039002292353 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+72] + mov r15, QWORD PTR [rcx+-40] + mov rdi, QWORD PTR [rcx+48] + mov rsi, QWORD PTR [rcx+-64] + mov rbx, QWORD PTR [rcx+-16] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 
+ mov QWORD PTR [rcx+-40], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+48], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-64], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-16], rbp + mov QWORD PTR [rcx+72], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+40] + mov r15, QWORD PTR [rcx+-72] + mov rdi, QWORD PTR [rcx+16] + mov rsi, QWORD PTR [rcx+64] + mov rbx, QWORD PTR [rcx+-48] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-72], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+16], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+64], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-48], rbp + mov QWORD PTR [rcx+40], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+8] + mov r15, QWORD PTR [rcx+96] + mov rdi, QWORD PTR [rcx+-56] + mov rsi, QWORD PTR [rcx+32] + mov rbx, QWORD PTR [rcx+-80] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+96], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-56], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+32], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-80], rbp + mov QWORD PTR [rcx+8], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+-24] + xor r12, QWORD PTR [rcx+24] + xor r13, QWORD PTR [rcx+-88] + xor rax, QWORD PTR [rcx] + xor r10, QWORD PTR [rcx+88] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor 
rbx, r13 + mov QWORD PTR [rcx+-24], r14 + mov QWORD PTR [rcx+24], r15 + mov QWORD PTR [rcx+-88], rdi + mov QWORD PTR [rcx], rsi + mov QWORD PTR [rcx+88], rbx + ; Round 21 + xor r14, r9 + xor rbx, QWORD PTR [rcx+-80] + xor r15, QWORD PTR [rcx+-72] + xor rsi, QWORD PTR [rcx+-64] + xor rdi, QWORD PTR [rcx+-56] + xor rbx, QWORD PTR [rcx+-48] + xor r15, QWORD PTR [rcx+-40] + xor rsi, QWORD PTR [rcx+-32] + xor rbx, QWORD PTR [rcx+-16] + xor r15, QWORD PTR [rcx+-8] + xor r14, QWORD PTR [rcx+8] + xor rdi, QWORD PTR [rcx+16] + xor rsi, QWORD PTR [rcx+32] + xor r14, QWORD PTR [rcx+40] + xor rdi, QWORD PTR [rcx+48] + xor rbx, QWORD PTR [rcx+56] + xor rsi, QWORD PTR [rcx+64] + xor r14, QWORD PTR [rcx+72] + xor rdi, QWORD PTR [rcx+80] + xor r15, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+-40] + mov rdi, QWORD PTR [rcx+16] + mov rsi, QWORD PTR [rcx+32] + mov rbx, QWORD PTR [rcx+88] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-40], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+16], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+32], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+88], rsi + ; XOR in constant + mov rbx, 9223372036854808704 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+-32] + mov r15, QWORD PTR [rcx+-16] + mov rdi, QWORD PTR [rcx+40] + mov rsi, QWORD PTR [rcx+96] + mov rbx, QWORD PTR [rcx+-88] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-16], rbp + andn rbp, rsi, 
rbx + xor rbp, rdi + mov QWORD PTR [rcx+40], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+96], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-88], rbp + mov QWORD PTR [rcx+-32], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+-8] + mov r15, QWORD PTR [rcx+48] + mov rdi, QWORD PTR [rcx+64] + mov rsi, QWORD PTR [rcx+-80] + mov rbx, QWORD PTR [rcx+-24] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+48], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+64], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-80], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-24], rbp + mov QWORD PTR [rcx+-8], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+56] + mov r15, QWORD PTR [rcx+72] + mov rdi, QWORD PTR [rcx+-72] + mov rsi, QWORD PTR [rcx+-56] + mov rbx, QWORD PTR [rcx] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+72], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-72], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-56], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx], rbp + mov QWORD PTR [rcx+56], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+80] + xor r12, QWORD PTR [rcx+-64] + xor r13, QWORD PTR [rcx+-48] + xor rax, QWORD PTR [rcx+8] + xor r10, QWORD PTR [rcx+24] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+80], r14 + mov QWORD 
PTR [rcx+-64], r15 + mov QWORD PTR [rcx+-48], rdi + mov QWORD PTR [rcx+8], rsi + mov QWORD PTR [rcx+24], rbx + ; Round 22 + xor r14, r9 + xor rbx, QWORD PTR [rcx+-88] + xor rsi, QWORD PTR [rcx+-80] + xor rdi, QWORD PTR [rcx+-72] + xor rsi, QWORD PTR [rcx+-56] + xor r15, QWORD PTR [rcx+-40] + xor r14, QWORD PTR [rcx+-32] + xor rbx, QWORD PTR [rcx+-24] + xor r15, QWORD PTR [rcx+-16] + xor r14, QWORD PTR [rcx+-8] + xor rbx, QWORD PTR [rcx] + xor rdi, QWORD PTR [rcx+16] + xor rsi, QWORD PTR [rcx+32] + xor rdi, QWORD PTR [rcx+40] + xor r15, QWORD PTR [rcx+48] + xor r14, QWORD PTR [rcx+56] + xor rdi, QWORD PTR [rcx+64] + xor r15, QWORD PTR [rcx+72] + xor rbx, QWORD PTR [rcx+88] + xor rsi, QWORD PTR [rcx+96] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+-16] + mov rdi, QWORD PTR [rcx+64] + mov rsi, QWORD PTR [rcx+-56] + mov rbx, QWORD PTR [rcx+24] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-16], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+64], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-56], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+24], rsi + ; XOR in constant + mov rbx, 2147483649 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+32] + mov r15, QWORD PTR [rcx+-88] + mov rdi, QWORD PTR [rcx+-8] + mov rsi, QWORD PTR [rcx+72] + mov rbx, QWORD PTR [rcx+-48] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-88], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-8], rbp + andn rbp, 
rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+72], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-48], rbp + mov QWORD PTR [rcx+32], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+-40] + mov r15, QWORD PTR [rcx+40] + mov rdi, QWORD PTR [rcx+-80] + mov rsi, QWORD PTR [rcx] + mov rbx, QWORD PTR [rcx+80] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+40], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-80], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+80], rbp + mov QWORD PTR [rcx+-40], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+88] + mov r15, QWORD PTR [rcx+-32] + mov rdi, QWORD PTR [rcx+48] + mov rsi, QWORD PTR [rcx+-72] + mov rbx, QWORD PTR [rcx+8] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-32], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+48], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-72], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+8], rbp + mov QWORD PTR [rcx+88], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+16] + xor r12, QWORD PTR [rcx+96] + xor r13, QWORD PTR [rcx+-24] + xor rax, QWORD PTR [rcx+56] + xor r10, QWORD PTR [rcx+-64] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+16], r14 + mov QWORD PTR [rcx+96], r15 + mov QWORD PTR [rcx+-24], rdi + mov QWORD 
PTR [rcx+56], rsi + mov QWORD PTR [rcx+-64], rbx + ; Round 23 + xor r14, r9 + xor r15, QWORD PTR [rcx+-88] + xor rdi, QWORD PTR [rcx+-80] + xor rsi, QWORD PTR [rcx+-72] + xor rsi, QWORD PTR [rcx+-56] + xor rbx, QWORD PTR [rcx+-48] + xor r14, QWORD PTR [rcx+-40] + xor r15, QWORD PTR [rcx+-32] + xor r15, QWORD PTR [rcx+-16] + xor rdi, QWORD PTR [rcx+-8] + xor rsi, QWORD PTR [rcx] + xor rbx, QWORD PTR [rcx+8] + xor rbx, QWORD PTR [rcx+24] + xor r14, QWORD PTR [rcx+32] + xor r15, QWORD PTR [rcx+40] + xor rdi, QWORD PTR [rcx+48] + xor rdi, QWORD PTR [rcx+64] + xor rsi, QWORD PTR [rcx+72] + xor rbx, QWORD PTR [rcx+80] + xor r14, QWORD PTR [rcx+88] + ; Calc t[0..4] + rorx rax, r15, 63 + rorx r10, rdi, 63 + rorx r11, rsi, 63 + rorx r12, rbx, 63 + rorx r13, r14, 63 + xor rax, rbx + xor r10, r14 + xor r11, r15 + xor r12, rdi + xor r13, rsi + ; Row Mix + ; Row 0 + mov r14, r9 + mov r15, QWORD PTR [rcx+-88] + mov rdi, QWORD PTR [rcx+-80] + mov rsi, QWORD PTR [rcx+-72] + mov rbx, QWORD PTR [rcx+-64] + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + rol r15, 44 + rol rdi, 43 + rol rsi, 21 + rol rbx, 14 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-88], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-80], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+-72], rbp + andn rsi, r14, r15 + andn r9, r15, rdi + xor rsi, rbx + xor r9, r14 + mov QWORD PTR [rcx+-64], rsi + ; XOR in constant + mov rbx, 9223372039002292232 + xor r9, rbx + ; Row 1 + mov r14, QWORD PTR [rcx+-56] + mov r15, QWORD PTR [rcx+-48] + mov rdi, QWORD PTR [rcx+-40] + mov rsi, QWORD PTR [rcx+-32] + mov rbx, QWORD PTR [rcx+-24] + xor r14, r12 + xor r15, r13 + xor rdi, rax + xor rsi, r10 + xor rbx, r11 + rol r14, 28 + rol r15, 20 + rol rdi, 3 + rol rsi, 45 + rol rbx, 61 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-48], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+-40], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR 
[rcx+-32], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+-24], rbp + mov QWORD PTR [rcx+-56], rsi + ; Row 2 + mov r14, QWORD PTR [rcx+-16] + mov r15, QWORD PTR [rcx+-8] + mov rdi, QWORD PTR [rcx] + mov rsi, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+16] + xor r14, r10 + xor r15, r11 + xor rdi, r12 + xor rsi, r13 + xor rbx, rax + rol r14, 1 + rol r15, 6 + rol rdi, 25 + rol rsi, 8 + rol rbx, 18 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+-8], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+8], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+16], rbp + mov QWORD PTR [rcx+-16], rsi + ; Row 3 + mov r14, QWORD PTR [rcx+24] + mov r15, QWORD PTR [rcx+32] + mov rdi, QWORD PTR [rcx+40] + mov rsi, QWORD PTR [rcx+48] + mov rbx, QWORD PTR [rcx+56] + xor r14, r13 + xor r15, rax + xor rdi, r10 + xor rsi, r11 + xor rbx, r12 + rol r14, 27 + rol r15, 36 + rol rdi, 10 + rol rsi, 15 + rol rbx, 56 + andn rbp, rdi, rsi + xor rbp, r15 + mov QWORD PTR [rcx+32], rbp + andn rbp, rsi, rbx + xor rbp, rdi + mov QWORD PTR [rcx+40], rbp + andn rbp, rbx, r14 + xor rbp, rsi + mov QWORD PTR [rcx+48], rbp + andn rbp, r14, r15 + andn rsi, r15, rdi + xor rbp, rbx + xor rsi, r14 + mov QWORD PTR [rcx+56], rbp + mov QWORD PTR [rcx+24], rsi + ; Row 4 + xor r11, QWORD PTR [rcx+64] + xor r12, QWORD PTR [rcx+72] + xor r13, QWORD PTR [rcx+80] + xor rax, QWORD PTR [rcx+88] + xor r10, QWORD PTR [rcx+96] + rorx r14, r11, 2 + rorx r15, r12, 9 + rorx rdi, r13, 25 + rorx rsi, rax, 23 + rorx rbx, r10, 62 + andn rax, r15, rdi + andn r10, rdi, rsi + andn r11, rsi, rbx + andn r12, rbx, r14 + andn r13, r14, r15 + xor r14, rax + xor r15, r10 + xor rdi, r11 + xor rsi, r12 + xor rbx, r13 + mov QWORD PTR [rcx+64], r14 + mov QWORD PTR [rcx+72], r15 + mov QWORD PTR [rcx+80], rdi + mov QWORD PTR [rcx+88], rsi + mov QWORD PTR [rcx+96], rbx + 
add rdx, QWORD PTR [rsp] + sub r8d, 1 + mov rbp, QWORD PTR [rsp] + jg L_sha3_block_n_bmi2_start + mov QWORD PTR [rcx+-96], r9 + pop rbp + pop rbp + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +sha3_block_n_bmi2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_sha3_block_avx2_rotl QWORD 0000000000000001h, 000000000000003eh + QWORD 000000000000001ch, 000000000000001bh + QWORD 000000000000002ch, 0000000000000006h + QWORD 0000000000000037h, 0000000000000014h + QWORD 000000000000000ah, 000000000000002bh + QWORD 0000000000000019h, 0000000000000027h + QWORD 000000000000002dh, 000000000000000fh + QWORD 0000000000000015h, 0000000000000008h + QWORD 0000000000000024h, 0000000000000003h + QWORD 0000000000000029h, 0000000000000012h + QWORD 0000000000000002h, 000000000000003dh + QWORD 0000000000000038h, 000000000000000eh +ptr_L_sha3_block_avx2_rotl QWORD L_sha3_block_avx2_rotl +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_sha3_block_avx2_rotr QWORD 000000000000003fh, 0000000000000002h + QWORD 0000000000000024h, 0000000000000025h + QWORD 0000000000000014h, 000000000000003ah + QWORD 0000000000000009h, 000000000000002ch + QWORD 0000000000000036h, 0000000000000015h + QWORD 0000000000000027h, 0000000000000019h + QWORD 0000000000000013h, 0000000000000031h + QWORD 000000000000002bh, 0000000000000038h + QWORD 000000000000001ch, 000000000000003dh + QWORD 0000000000000017h, 000000000000002eh + QWORD 000000000000003eh, 0000000000000003h + QWORD 0000000000000008h, 0000000000000032h +ptr_L_sha3_block_avx2_rotr QWORD L_sha3_block_avx2_rotr +_DATA ENDS +_TEXT SEGMENT READONLY PARA +sha3_block_avx2 PROC + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + mov 
rdx, QWORD PTR [ptr_L_sha3_avx2_r] + mov rax, QWORD PTR [ptr_L_sha3_block_avx2_rotl] + add rax, 64 + mov r8, QWORD PTR [ptr_L_sha3_block_avx2_rotr] + add r8, 64 + mov r9, 24 + vpbroadcastq ymm0, QWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+8] + vmovdqu ymm2, YMMWORD PTR [rcx+40] + vmovdqu ymm3, YMMWORD PTR [rcx+72] + vmovdqu ymm4, YMMWORD PTR [rcx+104] + vmovdqu ymm5, YMMWORD PTR [rcx+136] + vmovdqu ymm6, YMMWORD PTR [rcx+168] + vpermq ymm7, ymm2, 57 + vpermq ymm8, ymm3, 30 + vpermq ymm9, ymm4, 75 + vpermq ymm10, ymm5, 147 + vpblendd ymm11, ymm2, ymm3, 12 + vpblendd ymm12, ymm4, ymm5, 192 + vpblendd ymm2, ymm7, ymm8, 192 + vpblendd ymm3, ymm8, ymm9, 240 + vpblendd ymm4, ymm10, ymm9, 3 + vpblendd ymm5, ymm11, ymm12, 240 +L_sha3_block_avx2_start: + ; Calc b[0..4] + vpshufd ymm7, ymm5, 238 + vpxor ymm15, ymm1, ymm2 + vpxor ymm14, ymm5, ymm7 + vpxor ymm12, ymm3, ymm4 + vpermq ymm7, ymm14, 170 + vpxor ymm14, ymm14, ymm0 + vpxor ymm14, ymm14, ymm7 + vpxor ymm15, ymm15, ymm6 + vpxor ymm15, ymm15, ymm12 + vpermq ymm14, ymm14, 0 + ; XOR in b[x+4] + vpermq ymm7, ymm15, 147 + vpermq ymm9, ymm15, 57 + vpermq ymm10, ymm15, 0 + vpermq ymm15, ymm15, 255 + vpblendd ymm9, ymm9, ymm14, 192 + vpblendd ymm14, ymm7, ymm14, 3 + ; Rotate left 1 + vpsrlq ymm8, ymm10, 63 + vpaddq ymm10, ymm10, ymm10 + vpsrlq ymm7, ymm9, 63 + vpaddq ymm9, ymm9, ymm9 + vpor ymm10, ymm10, ymm8 + vpor ymm9, ymm9, ymm7 + vpxor ymm10, ymm10, ymm15 + vpxor ymm9, ymm9, ymm14 + ; XOR in ROTL64(b[x+1]) + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm9 + vpxor ymm3, ymm3, ymm9 + vpxor ymm4, ymm4, ymm9 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm9 + ; Shuffle - Rotate + vmovdqu ymm7, YMMWORD PTR [r8+-64] + vmovdqu ymm9, YMMWORD PTR [r8+-32] + vmovdqu ymm11, YMMWORD PTR [r8] + vmovdqu ymm8, YMMWORD PTR [rax+-64] + vmovdqu ymm10, YMMWORD PTR [rax+-32] + vmovdqu ymm12, YMMWORD PTR [rax] + vpsrlvq ymm7, ymm1, ymm7 + vpsrlvq ymm9, ymm2, ymm9 + vpsrlvq ymm11, ymm3, ymm11 + vpsllvq ymm1, 
ymm1, ymm8 + vpsllvq ymm2, ymm2, ymm10 + vpsllvq ymm3, ymm3, ymm12 + vpor ymm1, ymm1, ymm7 + vpor ymm2, ymm2, ymm9 + vpor ymm3, ymm3, ymm11 + vmovdqu ymm7, YMMWORD PTR [r8+32] + vmovdqu ymm9, YMMWORD PTR [r8+64] + vmovdqu ymm11, YMMWORD PTR [r8+96] + vmovdqu ymm8, YMMWORD PTR [rax+32] + vmovdqu ymm10, YMMWORD PTR [rax+64] + vmovdqu ymm12, YMMWORD PTR [rax+96] + vpsrlvq ymm7, ymm4, ymm7 + vpsrlvq ymm9, ymm5, ymm9 + vpsrlvq ymm11, ymm6, ymm11 + vpsllvq ymm4, ymm4, ymm8 + vpsllvq ymm5, ymm5, ymm10 + vpsllvq ymm6, ymm6, ymm12 + vpor ymm4, ymm4, ymm7 + vpor ymm5, ymm5, ymm9 + vpor ymm6, ymm6, ymm11 + ; Row Mix + vpermq ymm12, ymm2, 0 + vpermq ymm13, ymm3, 85 + vpermq ymm14, ymm4, 170 + vpermq ymm15, ymm6, 255 + vpandn ymm7, ymm13, ymm14 + vpandn ymm8, ymm14, ymm15 + vpandn ymm9, ymm15, ymm0 + vpandn ymm10, ymm0, ymm12 + vpandn ymm11, ymm12, ymm13 + vpxor ymm12, ymm12, ymm7 + vpxor ymm13, ymm13, ymm8 + vpxor ymm14, ymm14, ymm9 + vpxor ymm15, ymm15, ymm10 + vpxor ymm0, ymm0, ymm11 + vpermq ymm7, ymm5, 141 + vpblendd ymm10, ymm12, ymm13, 12 + vpermq ymm11, ymm1, 114 + vpblendd ymm9, ymm14, ymm15, 192 + vpermq ymm12, ymm2, 135 + vpblendd ymm1, ymm10, ymm9, 240 + vpermq ymm13, ymm3, 201 + vpermq ymm14, ymm4, 156 + vpermq ymm15, ymm6, 45 + vpblendd ymm12, ymm12, ymm7, 48 + vpblendd ymm13, ymm13, ymm7, 3 + vpblendd ymm14, ymm14, ymm7, 192 + vpblendd ymm15, ymm15, ymm7, 12 + vpandn ymm5, ymm12, ymm13 + vpandn ymm7, ymm13, ymm14 + vpandn ymm2, ymm14, ymm15 + vpandn ymm3, ymm15, ymm11 + vpandn ymm4, ymm11, ymm12 + vpxor ymm5, ymm11, ymm5 + vpxor ymm12, ymm12, ymm7 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vpxor ymm15, ymm15, ymm4 + vperm2i128 ymm3, ymm12, ymm14, 32 + vperm2i128 ymm7, ymm13, ymm15, 32 + vperm2i128 ymm6, ymm12, ymm14, 49 + vperm2i128 ymm8, ymm13, ymm15, 49 + vpunpcklqdq ymm2, ymm3, ymm7 + vpunpckhqdq ymm3, ymm3, ymm7 + vpunpcklqdq ymm4, ymm6, ymm8 + vpunpckhqdq ymm6, ymm6, ymm8 + vpxor ymm0, ymm0, [rdx] + add rdx, 32 + sub r9, 1 + jnz 
L_sha3_block_avx2_start + vpermq ymm7, ymm2, 147 + vpermq ymm8, ymm3, 78 + vpermq ymm9, ymm4, 57 + vpblendd ymm2, ymm7, ymm5, 3 + vpblendd ymm3, ymm8, ymm7, 3 + vpblendd ymm3, ymm3, ymm5, 12 + vpblendd ymm4, ymm8, ymm9, 192 + vpblendd ymm4, ymm4, ymm5, 48 + vpblendd ymm5, ymm9, ymm5, 192 + vmovq QWORD PTR [rcx], xmm0 + vmovdqu YMMWORD PTR [rcx+8], ymm1 + vmovdqu YMMWORD PTR [rcx+40], ymm2 + vmovdqu YMMWORD PTR [rcx+72], ymm3 + vmovdqu YMMWORD PTR [rcx+104], ymm4 + vmovdqu YMMWORD PTR [rcx+136], ymm5 + vmovdqu YMMWORD PTR [rcx+168], ymm6 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +sha3_block_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_sha3_block_n_avx2_rotl QWORD 0000000000000001h, 000000000000003eh + QWORD 000000000000001ch, 000000000000001bh + QWORD 000000000000002ch, 0000000000000006h + QWORD 0000000000000037h, 0000000000000014h + QWORD 000000000000000ah, 000000000000002bh + QWORD 0000000000000019h, 0000000000000027h + QWORD 000000000000002dh, 000000000000000fh + QWORD 0000000000000015h, 0000000000000008h + QWORD 0000000000000024h, 0000000000000003h + QWORD 0000000000000029h, 0000000000000012h + QWORD 0000000000000002h, 000000000000003dh + QWORD 0000000000000038h, 000000000000000eh +ptr_L_sha3_block_n_avx2_rotl QWORD L_sha3_block_n_avx2_rotl +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_sha3_block_n_avx2_rotr QWORD 000000000000003fh, 0000000000000002h + QWORD 0000000000000024h, 0000000000000025h + QWORD 0000000000000014h, 000000000000003ah + QWORD 0000000000000009h, 000000000000002ch + QWORD 0000000000000036h, 0000000000000015h + QWORD 0000000000000027h, 0000000000000019h + QWORD 0000000000000013h, 0000000000000031h + 
QWORD 000000000000002bh, 0000000000000038h + QWORD 000000000000001ch, 000000000000003dh + QWORD 0000000000000017h, 000000000000002eh + QWORD 000000000000003eh, 0000000000000003h + QWORD 0000000000000008h, 0000000000000032h +ptr_L_sha3_block_n_avx2_rotr QWORD L_sha3_block_n_avx2_rotr +_DATA ENDS +_TEXT SEGMENT READONLY PARA +sha3_block_n_avx2 PROC + push r12 + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + mov rax, QWORD PTR [ptr_L_sha3_avx2_r] + mov r10, QWORD PTR [ptr_L_sha3_block_n_avx2_rotl] + add r10, 64 + mov r11, QWORD PTR [ptr_L_sha3_block_n_avx2_rotr] + add r11, 64 + vpbroadcastq ymm0, QWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+8] + vmovdqu ymm2, YMMWORD PTR [rcx+40] + vmovdqu ymm3, YMMWORD PTR [rcx+72] + vmovdqu ymm4, YMMWORD PTR [rcx+104] + vmovdqu ymm5, YMMWORD PTR [rcx+136] + vmovdqu ymm6, YMMWORD PTR [rcx+168] + mov r12, 24 + cmp r9, 136 + je L_sha3_block_n_avx2_load_256_1 + cmp r9, 168 + je L_sha3_block_n_avx2_load_128_1 + cmp r9, 144 + je L_sha3_block_n_avx2_load_224_1 + cmp r9, 104 + je L_sha3_block_n_avx2_load_384_1 + vpbroadcastq ymm7, QWORD PTR [rdx] + vmovdqu ymm8, YMMWORD PTR [rdx+8] + vmovdqu ymm9, YMMWORD PTR [rdx+40] + vpxor ymm0, ymm0, ymm7 + vpxor ymm1, ymm1, ymm8 + vpxor ymm2, ymm2, ymm9 + jmp L_sha3_block_n_avx2_start_1 +L_sha3_block_n_avx2_load_128_1: + vpbroadcastq ymm7, QWORD PTR [rdx] + vmovdqu ymm8, YMMWORD PTR [rdx+8] + vmovdqu ymm9, YMMWORD PTR [rdx+40] + vmovdqu ymm10, YMMWORD PTR [rdx+72] + vmovdqu ymm11, YMMWORD PTR [rdx+104] + vmovdqu ymm12, YMMWORD PTR [rdx+136] + vpxor ymm0, ymm0, ymm7 + vpxor ymm1, ymm1, ymm8 + vpxor ymm2, ymm2, ymm9 + vpxor ymm3, ymm3, ymm10 + vpxor ymm4, ymm4, ymm11 + vpxor ymm5, 
ymm5, ymm12 + jmp L_sha3_block_n_avx2_start_1 +L_sha3_block_n_avx2_load_224_1: + vpxor ymm12, ymm12, ymm12 + vpbroadcastq ymm7, QWORD PTR [rdx] + vmovdqu ymm8, YMMWORD PTR [rdx+8] + vmovdqu ymm9, YMMWORD PTR [rdx+40] + vmovdqu ymm10, YMMWORD PTR [rdx+72] + vmovdqu ymm11, YMMWORD PTR [rdx+104] + vmovq xmm12, QWORD PTR [rdx+136] + vpxor ymm0, ymm0, ymm7 + vpxor ymm1, ymm1, ymm8 + vpxor ymm2, ymm2, ymm9 + vpxor ymm3, ymm3, ymm10 + vpxor ymm4, ymm4, ymm11 + vpxor ymm5, ymm5, ymm12 + jmp L_sha3_block_n_avx2_start_1 +L_sha3_block_n_avx2_load_384_1: + vpbroadcastq ymm7, QWORD PTR [rdx] + vmovdqu ymm8, YMMWORD PTR [rdx+8] + vmovdqu ymm9, YMMWORD PTR [rdx+40] + vmovdqu ymm10, YMMWORD PTR [rdx+72] + vpxor ymm0, ymm0, ymm7 + vpxor ymm1, ymm1, ymm8 + vpxor ymm2, ymm2, ymm9 + vpxor ymm3, ymm3, ymm10 + jmp L_sha3_block_n_avx2_start_1 +L_sha3_block_n_avx2_load_256_1: + vpbroadcastq ymm7, QWORD PTR [rdx] + vmovdqu ymm8, YMMWORD PTR [rdx+8] + vmovdqu ymm9, YMMWORD PTR [rdx+40] + vmovdqu ymm10, YMMWORD PTR [rdx+72] + vmovdqu ymm11, YMMWORD PTR [rdx+104] + vpxor ymm0, ymm0, ymm7 + vpxor ymm1, ymm1, ymm8 + vpxor ymm2, ymm2, ymm9 + vpxor ymm3, ymm3, ymm10 + vpxor ymm4, ymm4, ymm11 +L_sha3_block_n_avx2_start_1: + vpermq ymm7, ymm2, 57 + vpermq ymm8, ymm3, 30 + vpermq ymm9, ymm4, 75 + vpermq ymm10, ymm5, 147 + vpblendd ymm11, ymm2, ymm3, 12 + vpblendd ymm12, ymm4, ymm5, 192 + vpblendd ymm2, ymm7, ymm8, 192 + vpblendd ymm3, ymm8, ymm9, 240 + vpblendd ymm4, ymm10, ymm9, 3 + vpblendd ymm5, ymm11, ymm12, 240 + jmp L_sha3_block_n_avx2_rounds +L_sha3_block_n_avx2_start: + mov r12, 24 + cmp r9, 136 + je L_sha3_block_n_avx2_load_256 + cmp r9, 168 + je L_sha3_block_n_avx2_load_128 + cmp r9, 144 + je L_sha3_block_n_avx2_load_224 + cmp r9, 104 + je L_sha3_block_n_avx2_load_384 + vpbroadcastq ymm7, QWORD PTR [rdx] + vmovdqu ymm8, YMMWORD PTR [rdx+8] + vmovdqu ymm9, YMMWORD PTR [rdx+40] + vpxor ymm12, ymm12, ymm12 + vpxor ymm0, ymm0, ymm7 + vpxor ymm1, ymm1, ymm8 + vpermq ymm7, ymm9, 57 + vpblendd 
ymm15, ymm9, ymm12, 252 + vpblendd ymm7, ymm7, ymm12, 192 + vpxor ymm2, ymm2, ymm7 + vpxor ymm5, ymm5, ymm15 + jmp L_sha3_block_n_avx2_rounds +L_sha3_block_n_avx2_load_128: + vpbroadcastq ymm7, QWORD PTR [rdx] + vmovdqu ymm8, YMMWORD PTR [rdx+8] + vmovdqu ymm9, YMMWORD PTR [rdx+40] + vmovdqu ymm10, YMMWORD PTR [rdx+72] + vmovdqu ymm11, YMMWORD PTR [rdx+104] + vmovdqu ymm12, YMMWORD PTR [rdx+136] + vpxor ymm0, ymm0, ymm7 + vpxor ymm1, ymm1, ymm8 + vpermq ymm7, ymm9, 57 + vpermq ymm8, ymm10, 30 + vpermq ymm13, ymm11, 75 + vpermq ymm14, ymm12, 147 + vpblendd ymm15, ymm9, ymm10, 12 + vpblendd ymm11, ymm11, ymm12, 192 + vpblendd ymm7, ymm7, ymm8, 192 + vpblendd ymm8, ymm8, ymm13, 240 + vpblendd ymm13, ymm14, ymm13, 3 + vpblendd ymm11, ymm15, ymm11, 240 + vpxor ymm2, ymm2, ymm7 + vpxor ymm3, ymm3, ymm8 + vpxor ymm4, ymm4, ymm13 + vpxor ymm5, ymm5, ymm11 + jmp L_sha3_block_n_avx2_rounds +L_sha3_block_n_avx2_load_224: + vpxor ymm12, ymm12, ymm12 + vpbroadcastq ymm7, QWORD PTR [rdx] + vmovdqu ymm8, YMMWORD PTR [rdx+8] + vmovdqu ymm9, YMMWORD PTR [rdx+40] + vmovdqu ymm10, YMMWORD PTR [rdx+72] + vmovdqu ymm11, YMMWORD PTR [rdx+104] + vmovq xmm12, QWORD PTR [rdx+136] + vpxor ymm0, ymm0, ymm7 + vpxor ymm1, ymm1, ymm8 + vpermq ymm7, ymm9, 57 + vpermq ymm8, ymm10, 30 + vpermq ymm13, ymm11, 75 + vpermq ymm14, ymm12, 147 + vpblendd ymm15, ymm9, ymm10, 12 + vpblendd ymm11, ymm11, ymm12, 192 + vpblendd ymm7, ymm7, ymm8, 192 + vpblendd ymm8, ymm8, ymm13, 240 + vpblendd ymm13, ymm14, ymm13, 3 + vpblendd ymm11, ymm15, ymm11, 240 + vpxor ymm2, ymm2, ymm7 + vpxor ymm3, ymm3, ymm8 + vpxor ymm4, ymm4, ymm13 + vpxor ymm5, ymm5, ymm11 + jmp L_sha3_block_n_avx2_rounds +L_sha3_block_n_avx2_load_384: + vpbroadcastq ymm7, QWORD PTR [rdx] + vmovdqu ymm8, YMMWORD PTR [rdx+8] + vmovdqu ymm9, YMMWORD PTR [rdx+40] + vmovdqu ymm10, YMMWORD PTR [rdx+72] + vpxor ymm12, ymm12, ymm12 + vpxor ymm0, ymm0, ymm7 + vpxor ymm1, ymm1, ymm8 + vpermq ymm7, ymm9, 57 + vpermq ymm8, ymm10, 30 + vpblendd ymm13, ymm10, 
ymm12, 243 + vpblendd ymm15, ymm9, ymm13, 252 + vpblendd ymm7, ymm7, ymm8, 192 + vpblendd ymm8, ymm8, ymm12, 240 + vpxor ymm2, ymm2, ymm7 + vpxor ymm3, ymm3, ymm8 + vpxor ymm5, ymm5, ymm15 + jmp L_sha3_block_n_avx2_rounds +L_sha3_block_n_avx2_load_256: + vpbroadcastq ymm7, QWORD PTR [rdx] + vmovdqu ymm8, YMMWORD PTR [rdx+8] + vmovdqu ymm9, YMMWORD PTR [rdx+40] + vmovdqu ymm10, YMMWORD PTR [rdx+72] + vmovdqu ymm11, YMMWORD PTR [rdx+104] + vpxor ymm12, ymm12, ymm12 + vpxor ymm0, ymm0, ymm7 + vpxor ymm1, ymm1, ymm8 + vpermq ymm7, ymm9, 57 + vpermq ymm8, ymm10, 30 + vpermq ymm13, ymm11, 75 + vpblendd ymm15, ymm9, ymm10, 12 + vpblendd ymm11, ymm11, ymm12, 207 + vpblendd ymm7, ymm7, ymm8, 192 + vpblendd ymm8, ymm8, ymm13, 240 + vpblendd ymm13, ymm13, ymm12, 252 + vpblendd ymm11, ymm15, ymm11, 240 + vpxor ymm2, ymm2, ymm7 + vpxor ymm3, ymm3, ymm8 + vpxor ymm4, ymm4, ymm13 + vpxor ymm5, ymm5, ymm11 +L_sha3_block_n_avx2_rounds: + ; Calc b[0..4] + vpshufd ymm7, ymm5, 238 + vpxor ymm15, ymm1, ymm2 + vpxor ymm14, ymm5, ymm7 + vpxor ymm12, ymm3, ymm4 + vpermq ymm7, ymm14, 170 + vpxor ymm14, ymm14, ymm0 + vpxor ymm14, ymm14, ymm7 + vpxor ymm15, ymm15, ymm6 + vpxor ymm15, ymm15, ymm12 + vpermq ymm14, ymm14, 0 + ; XOR in b[x+4] + vpermq ymm7, ymm15, 147 + vpermq ymm9, ymm15, 57 + vpermq ymm10, ymm15, 0 + vpermq ymm15, ymm15, 255 + vpblendd ymm9, ymm9, ymm14, 192 + vpblendd ymm14, ymm7, ymm14, 3 + ; Rotate left 1 + vpsrlq ymm8, ymm10, 63 + vpaddq ymm10, ymm10, ymm10 + vpsrlq ymm7, ymm9, 63 + vpaddq ymm9, ymm9, ymm9 + vpor ymm10, ymm10, ymm8 + vpor ymm9, ymm9, ymm7 + vpxor ymm10, ymm10, ymm15 + vpxor ymm9, ymm9, ymm14 + ; XOR in ROTL64(b[x+1]) + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm9 + vpxor ymm3, ymm3, ymm9 + vpxor ymm4, ymm4, ymm9 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm9 + ; Shuffle - Rotate + vmovdqu ymm7, YMMWORD PTR [r11+-64] + vmovdqu ymm9, YMMWORD PTR [r11+-32] + vmovdqu ymm11, YMMWORD PTR [r11] + vmovdqu ymm8, YMMWORD PTR 
[r10+-64] + vmovdqu ymm10, YMMWORD PTR [r10+-32] + vmovdqu ymm12, YMMWORD PTR [r10] + vpsrlvq ymm7, ymm1, ymm7 + vpsrlvq ymm9, ymm2, ymm9 + vpsrlvq ymm11, ymm3, ymm11 + vpsllvq ymm1, ymm1, ymm8 + vpsllvq ymm2, ymm2, ymm10 + vpsllvq ymm3, ymm3, ymm12 + vpor ymm1, ymm1, ymm7 + vpor ymm2, ymm2, ymm9 + vpor ymm3, ymm3, ymm11 + vmovdqu ymm7, YMMWORD PTR [r11+32] + vmovdqu ymm9, YMMWORD PTR [r11+64] + vmovdqu ymm11, YMMWORD PTR [r11+96] + vmovdqu ymm8, YMMWORD PTR [r10+32] + vmovdqu ymm10, YMMWORD PTR [r10+64] + vmovdqu ymm12, YMMWORD PTR [r10+96] + vpsrlvq ymm7, ymm4, ymm7 + vpsrlvq ymm9, ymm5, ymm9 + vpsrlvq ymm11, ymm6, ymm11 + vpsllvq ymm4, ymm4, ymm8 + vpsllvq ymm5, ymm5, ymm10 + vpsllvq ymm6, ymm6, ymm12 + vpor ymm4, ymm4, ymm7 + vpor ymm5, ymm5, ymm9 + vpor ymm6, ymm6, ymm11 + ; Row Mix + vpermq ymm12, ymm2, 0 + vpermq ymm13, ymm3, 85 + vpermq ymm14, ymm4, 170 + vpermq ymm15, ymm6, 255 + vpandn ymm7, ymm13, ymm14 + vpandn ymm8, ymm14, ymm15 + vpandn ymm9, ymm15, ymm0 + vpandn ymm10, ymm0, ymm12 + vpandn ymm11, ymm12, ymm13 + vpxor ymm12, ymm12, ymm7 + vpxor ymm13, ymm13, ymm8 + vpxor ymm14, ymm14, ymm9 + vpxor ymm15, ymm15, ymm10 + vpxor ymm0, ymm0, ymm11 + vpermq ymm7, ymm5, 141 + vpblendd ymm10, ymm12, ymm13, 12 + vpermq ymm11, ymm1, 114 + vpblendd ymm9, ymm14, ymm15, 192 + vpermq ymm12, ymm2, 135 + vpblendd ymm1, ymm10, ymm9, 240 + vpermq ymm13, ymm3, 201 + vpermq ymm14, ymm4, 156 + vpermq ymm15, ymm6, 45 + vpblendd ymm12, ymm12, ymm7, 48 + vpblendd ymm13, ymm13, ymm7, 3 + vpblendd ymm14, ymm14, ymm7, 192 + vpblendd ymm15, ymm15, ymm7, 12 + vpandn ymm5, ymm12, ymm13 + vpandn ymm7, ymm13, ymm14 + vpandn ymm2, ymm14, ymm15 + vpandn ymm3, ymm15, ymm11 + vpandn ymm4, ymm11, ymm12 + vpxor ymm5, ymm11, ymm5 + vpxor ymm12, ymm12, ymm7 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vpxor ymm15, ymm15, ymm4 + vperm2i128 ymm3, ymm12, ymm14, 32 + vperm2i128 ymm7, ymm13, ymm15, 32 + vperm2i128 ymm6, ymm12, ymm14, 49 + vperm2i128 ymm8, ymm13, ymm15, 49 + 
vpunpcklqdq ymm2, ymm3, ymm7 + vpunpckhqdq ymm3, ymm3, ymm7 + vpunpcklqdq ymm4, ymm6, ymm8 + vpunpckhqdq ymm6, ymm6, ymm8 + vpxor ymm0, ymm0, [rax] + add rax, 32 + sub r12, 1 + jnz L_sha3_block_n_avx2_rounds + sub rax, 768 + add rdx, r9 + sub r8d, 1 + jnz L_sha3_block_n_avx2_start + vpermq ymm7, ymm2, 147 + vpermq ymm8, ymm3, 78 + vpermq ymm9, ymm4, 57 + vpblendd ymm2, ymm7, ymm5, 3 + vpblendd ymm3, ymm8, ymm7, 3 + vpblendd ymm3, ymm3, ymm5, 12 + vpblendd ymm4, ymm8, ymm9, 192 + vpblendd ymm4, ymm4, ymm5, 48 + vpblendd ymm5, ymm9, ymm5, 192 + vmovq QWORD PTR [rcx], xmm0 + vmovdqu YMMWORD PTR [rcx+8], ymm1 + vmovdqu YMMWORD PTR [rcx+40], ymm2 + vmovdqu YMMWORD PTR [rcx+72], ymm3 + vmovdqu YMMWORD PTR [rcx+104], ymm4 + vmovdqu YMMWORD PTR [rcx+136], ymm5 + vmovdqu YMMWORD PTR [rcx+168], ymm6 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + pop r12 + ret +sha3_block_n_avx2 ENDP +_TEXT ENDS +wc_masm_cond_0 = 0 +IFDEF WOLFSSL_HAVE_MLKEM +wc_masm_cond_0 = 1 +ENDIF +IFDEF WOLFSSL_HAVE_MLDSA +wc_masm_cond_0 = 1 +ENDIF +IFDEF WOLFSSL_HAVE_SLHDSA +wc_masm_cond_0 = 1 +ENDIF +IF wc_masm_cond_0 +_TEXT SEGMENT READONLY PARA +sha3_blocksx4_avx2 PROC + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + mov rdx, QWORD PTR [ptr_L_sha3_x4_avx2_r] + vmovdqu ymm15, YMMWORD PTR [rcx] + mov rax, rcx + mov r8, rcx + add rcx, 128 + add rax, 384 
+ add r8, 640 + ; Round 0 + ; Calc b[0..4] + vmovdqu ymm11, YMMWORD PTR [rcx+-96] + vmovdqu ymm12, YMMWORD PTR [rcx+-64] + vmovdqu ymm13, YMMWORD PTR [rcx+-32] + vmovdqu ymm14, YMMWORD PTR [rcx] + vpxor ymm10, ymm15, [rcx+32] + vpxor ymm11, ymm11, [rcx+64] + vpxor ymm12, ymm12, [rcx+96] + vpxor ymm13, ymm13, [rcx+128] + vpxor ymm14, ymm14, [rax+-96] + vpxor ymm10, ymm10, [rax+-64] + vpxor ymm11, ymm11, [rax+-32] + vpxor ymm12, ymm12, [rax] + vpxor ymm13, ymm13, [rax+32] + vpxor ymm14, ymm14, [rax+64] + vpxor ymm10, ymm10, [rax+96] + vpxor ymm11, ymm11, [rax+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm10, ymm10, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm14, ymm14, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+64] + vpxor ymm12, ymm7, [rax] + vpxor ymm13, ymm8, [r8+-64] + vpxor ymm14, ymm9, [r8+128] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, 
ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [rax], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+-32] + vpxor ymm11, ymm9, [rax+-96] + vpxor ymm12, ymm5, [rax+-64] + vpxor ymm13, ymm6, [rax+128] + vpxor ymm14, ymm7, [r8+64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [rax+-96], ymm1 + vmovdqu YMMWORD PTR [rax+-64], ymm2 + vmovdqu YMMWORD PTR [rax+128], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+-96] + vpxor ymm11, ymm7, [rcx+96] + vpxor ymm12, ymm8, [rax+32] + vpxor ymm13, ymm9, [r8+-32] + vpxor ymm14, ymm5, [r8] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + 
vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [rax+32], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx] + vpxor ymm11, ymm5, [rcx+32] + vpxor ymm12, ymm6, [rax+-32] + vpxor ymm13, ymm7, [r8+-96] + vpxor ymm14, ymm8, [r8+96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [rax+-32], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+-64] + vpxor ymm11, ymm8, [rcx+128] + vpxor ymm12, ymm9, [rax+64] + vpxor ymm13, ymm5, [rax+96] + vpxor ymm14, ymm6, [r8+32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, 
ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [rax+64], ymm2 + vmovdqu YMMWORD PTR [rax+96], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Round 1 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-96] + vpxor ymm10, ymm10, [rcx+-32] + vpxor ymm10, ymm10, [rcx] + vpxor ymm11, ymm1, [rcx+32] + vpxor ymm11, ymm11, [rcx+64] + vpxor ymm11, ymm11, [rcx+96] + vpxor ymm11, ymm11, [rax+-96] + vpxor ymm12, ymm2, [rax+-64] + vpxor ymm12, ymm12, [rax+-32] + vpxor ymm12, ymm12, [rax] + vpxor ymm12, ymm12, [rax+32] + vpxor ymm13, ymm3, [rax+128] + vpxor ymm13, ymm13, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm13, ymm13, [r8+-32] + vpxor ymm14, ymm4, [r8] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm14, ymm14, [r8+96] + vpxor ymm14, ymm14, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rax+-96] + vpxor ymm12, ymm7, [rax+32] + vpxor ymm13, ymm8, [r8+-96] + vpxor ymm14, ymm9, [r8+32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, 
ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+32] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+-96], ymm1 + vmovdqu YMMWORD PTR [rax+32], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+-64] + vpxor ymm11, ymm9, [r8+64] + vpxor ymm12, ymm5, [rcx+-96] + vpxor ymm13, ymm6, [rcx+32] + vpxor ymm14, ymm7, [rax+64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [rax+64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+64] + vpxor ymm11, ymm7, [rax+-64] + vpxor ymm12, ymm8, [r8+-32] + vpxor ymm13, ymm9, [r8+96] + vpxor ymm14, ymm5, [rcx+-64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + 
vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [rax+-64], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+128] + vpxor ymm11, ymm5, [rcx+-32] + vpxor ymm12, ymm6, [rcx+96] + vpxor ymm13, ymm7, [rax+-32] + vpxor ymm14, ymm8, [rax+96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [rax+-32], ymm3 + vmovdqu YMMWORD PTR [rax+96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rax] + vpxor ymm11, ymm8, [rax+128] + vpxor ymm12, ymm9, [r8] + vpxor ymm13, ymm5, [rcx] + vpxor ymm14, ymm6, [rcx+128] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn 
ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax], ymm0 + vmovdqu YMMWORD PTR [rax+128], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Round 2 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx+32] + vpxor ymm10, ymm10, [rcx+64] + vpxor ymm12, ymm12, [rcx+96] + vpxor ymm11, ymm11, [rax+-96] + vpxor ymm11, ymm11, [rax+-64] + vpxor ymm13, ymm13, [rax+-32] + vpxor ymm12, ymm12, [rax+32] + vpxor ymm14, ymm14, [rax+64] + vpxor ymm14, ymm14, [rax+96] + vpxor ymm13, ymm13, [r8+-96] + vpxor ymm10, ymm10, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm10, ymm10, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+64] + vpxor ymm12, ymm7, [r8+-32] + vpxor ymm13, ymm8, [rax+-32] + vpxor ymm14, ymm9, [rcx+128] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, 
ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+64] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [rax+-32], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+-96] + vpxor ymm11, ymm9, [rax+64] + vpxor ymm12, ymm5, [rcx+64] + vpxor ymm13, ymm6, [rcx+-32] + vpxor ymm14, ymm7, [r8] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [rax+64], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rax+-96] + vpxor ymm11, ymm7, [rcx+-96] + vpxor ymm12, ymm8, [r8+96] + vpxor ymm13, ymm9, [rax+96] + vpxor ymm14, ymm5, [rax] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + 
vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [rax+96], ymm3 + vmovdqu YMMWORD PTR [rax], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+32] + vpxor ymm11, ymm5, [r8+-64] + vpxor ymm12, ymm6, [rax+-64] + vpxor ymm13, ymm7, [rcx+96] + vpxor ymm14, ymm8, [rcx] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [rax+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rax+32] + vpxor ymm11, ymm8, [rcx+32] + vpxor ymm12, ymm9, [rcx+-64] + vpxor ymm13, ymm5, [r8+128] + vpxor ymm14, ymm6, [rax+128] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor 
ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+32], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [rax+128], ymm4 + ; Round 3 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-32] + vpxor ymm14, ymm4, [rcx] + vpxor ymm12, ymm2, [rcx+64] + vpxor ymm13, ymm13, [rcx+96] + vpxor ymm14, ymm14, [rcx+128] + vpxor ymm10, ymm10, [rax+-96] + vpxor ymm12, ymm12, [rax+-64] + vpxor ymm13, ymm13, [rax+-32] + vpxor ymm14, ymm14, [rax] + vpxor ymm11, ymm11, [rax+64] + vpxor ymm13, ymm13, [rax+96] + vpxor ymm10, ymm10, [r8+-96] + vpxor ymm11, ymm11, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm10, ymm10, [r8+32] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm12, ymm12, [r8+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rax+64] + vpxor ymm12, ymm7, [r8+96] + vpxor ymm13, ymm8, [rcx+96] + vpxor ymm14, ymm9, [rax+128] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq 
ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+96] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+64], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rax+128], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rax+-32] + vpxor ymm11, ymm9, [r8] + vpxor ymm12, ymm5, [rax+-96] + vpxor ymm13, ymm6, [r8+-64] + vpxor ymm14, ymm7, [rcx+-64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+-32], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [rax+-96], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+64] + vpxor ymm11, ymm7, [rcx+64] + vpxor ymm12, ymm8, [rax+96] + vpxor ymm13, ymm9, [rcx] + vpxor ymm14, ymm5, [rax+32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 
8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [rax+96], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [rax+32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+128] + vpxor ymm11, ymm5, [r8+-96] + vpxor ymm12, ymm6, [rcx+-96] + vpxor ymm13, ymm7, [rax+-64] + vpxor ymm14, ymm8, [r8+128] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [rax+-64], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+-32] + vpxor ymm11, ymm8, [rcx+-32] + vpxor ymm12, ymm9, [rax] + vpxor ymm13, ymm5, [r8+32] + vpxor ymm14, ymm6, [rcx+32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + 
vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rax], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Round 4 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm13, ymm3, [rcx] + vpxor ymm11, ymm1, [rcx+64] + vpxor ymm13, ymm13, [rcx+96] + vpxor ymm10, ymm10, [rcx+128] + vpxor ymm12, ymm12, [rax+-96] + vpxor ymm13, ymm13, [rax+-64] + vpxor ymm10, ymm10, [rax+-32] + vpxor ymm14, ymm14, [rax+32] + vpxor ymm11, ymm11, [rax+64] + vpxor ymm12, ymm12, [rax+96] + vpxor ymm14, ymm14, [rax+128] + vpxor ymm11, ymm11, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm11, ymm11, [r8] + vpxor ymm10, ymm10, [r8+64] + vpxor ymm12, ymm12, [r8+96] + vpxor ymm14, ymm14, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8] + vpxor ymm12, ymm7, [rax+96] + vpxor ymm13, ymm8, [rax+-64] + vpxor ymm14, ymm9, [rcx+32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + 
vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+128] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [rax+96], ymm2 + vmovdqu YMMWORD PTR [rax+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+96] + vpxor ymm11, ymm9, [rcx+-64] + vpxor ymm12, ymm5, [r8+64] + vpxor ymm13, ymm6, [r8+-96] + vpxor ymm14, ymm7, [rax] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [rax], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rax+64] + vpxor ymm11, ymm7, [rax+-96] + vpxor ymm12, ymm8, [rcx] + vpxor ymm13, ymm9, [r8+128] + vpxor ymm14, ymm5, [r8+-32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 
46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+64], ymm0 + vmovdqu YMMWORD PTR [rax+-96], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rax+128] + vpxor ymm11, ymm5, [rax+-32] + vpxor ymm12, ymm6, [rcx+64] + vpxor ymm13, ymm7, [rcx+-96] + vpxor ymm14, ymm8, [r8+32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+128], ymm0 + vmovdqu YMMWORD PTR [rax+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+96] + vpxor ymm11, ymm8, [r8+-64] + vpxor ymm12, ymm9, [rax+32] + vpxor ymm13, ymm5, [rcx+128] + vpxor ymm14, ymm6, [rcx+-32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, 
ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [rax+32], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Round 5 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm12, ymm2, [rcx] + vpxor ymm14, ymm4, [rcx+32] + vpxor ymm12, ymm12, [rcx+64] + vpxor ymm10, ymm10, [rcx+96] + vpxor ymm11, ymm11, [rax+-96] + vpxor ymm13, ymm13, [rax+-64] + vpxor ymm11, ymm11, [rax+-32] + vpxor ymm14, ymm14, [rax] + vpxor ymm10, ymm10, [rax+64] + vpxor ymm12, ymm12, [rax+96] + vpxor ymm10, ymm10, [rax+128] + vpxor ymm13, ymm13, [r8+-96] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm11, ymm11, [r8] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm13, ymm13, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+-64] + vpxor ymm12, ymm7, [rcx] 
+ vpxor ymm13, ymm8, [rcx+-96] + vpxor ymm14, ymm9, [rcx+-32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+160] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rax+-64] + vpxor ymm11, ymm9, [rax] + vpxor ymm12, ymm5, [rax+64] + vpxor ymm13, ymm6, [rax+-32] + vpxor ymm14, ymm7, [rax+32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+-64], ymm0 + vmovdqu YMMWORD PTR [rax], ymm1 + vmovdqu YMMWORD PTR [rax+64], ymm2 + vmovdqu YMMWORD PTR [rax+-32], ymm3 + vmovdqu YMMWORD PTR [rax+32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8] + vpxor ymm11, ymm7, [r8+64] + vpxor ymm12, ymm8, [r8+128] + vpxor ymm13, ymm9, [r8+32] + vpxor ymm14, ymm5, [r8+96] + 
vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+32] + vpxor ymm11, ymm5, [rcx+96] + vpxor ymm12, ymm6, [rax+-96] + vpxor ymm13, ymm7, [rcx+64] + vpxor ymm14, ymm8, [rcx+128] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [rax+-96], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rax+96] + vpxor ymm11, ymm8, [r8+-96] + vpxor ymm12, ymm9, [r8+-32] + vpxor ymm13, ymm5, [rax+128] + vpxor ymm14, ymm6, [r8+-64] + 
vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+96], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [rax+128], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Round 6 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm12, ymm2, [rcx] + vpxor ymm10, ymm10, [rcx+32] + vpxor ymm13, ymm13, [rcx+64] + vpxor ymm11, ymm11, [rcx+96] + vpxor ymm14, ymm14, [rcx+128] + vpxor ymm12, ymm12, [rax+-96] + vpxor ymm10, ymm10, [rax+-64] + vpxor ymm13, ymm13, [rax+-32] + vpxor ymm11, ymm11, [rax] + vpxor ymm14, ymm14, [rax+32] + vpxor ymm12, ymm12, [rax+64] + vpxor ymm10, ymm10, [r8] + vpxor ymm13, ymm13, [r8+32] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm14, ymm14, [r8+96] + vpxor ymm12, ymm12, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, 
ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rax] + vpxor ymm12, ymm7, [r8+128] + vpxor ymm13, ymm8, [rcx+64] + vpxor ymm14, ymm9, [r8+-64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+192] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+-96] + vpxor ymm11, ymm9, [rax+32] + vpxor ymm12, ymm5, [r8] + vpxor ymm13, ymm6, [rcx+96] + vpxor ymm14, ymm7, [r8+-32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [rax+32], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+-64] + vpxor ymm11, 
ymm7, [rax+64] + vpxor ymm12, ymm8, [r8+32] + vpxor ymm13, ymm9, [rcx+128] + vpxor ymm14, ymm5, [rax+96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [rax+64], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [rax+96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+-32] + vpxor ymm11, ymm5, [rax+-64] + vpxor ymm12, ymm6, [r8+64] + vpxor ymm13, ymm7, [rax+-96] + vpxor ymm14, ymm8, [rax+128] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [rax+-64], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [rax+-96], ymm3 + vmovdqu YMMWORD PTR [rax+128], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx] + vpxor 
ymm11, ymm8, [rax+-32] + vpxor ymm12, ymm9, [r8+96] + vpxor ymm13, ymm5, [rcx+32] + vpxor ymm14, ymm6, [r8+-96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rax+-32], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Round 7 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-96] + vpxor ymm10, ymm10, [rcx+-64] + vpxor ymm10, ymm10, [rcx+-32] + vpxor ymm13, ymm3, [rcx+64] + vpxor ymm13, ymm13, [rcx+96] + vpxor ymm13, ymm13, [rcx+128] + vpxor ymm13, ymm13, [rax+-96] + vpxor ymm11, ymm1, [rax+-64] + vpxor ymm11, ymm11, [rax] + vpxor ymm11, ymm11, [rax+32] + vpxor ymm11, ymm11, [rax+64] + vpxor ymm14, ymm4, [rax+96] + vpxor ymm14, ymm14, [rax+128] + vpxor ymm14, ymm14, [r8+-64] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm12, ymm2, [r8] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm12, ymm12, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor 
ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rax+32] + vpxor ymm12, ymm7, [r8+32] + vpxor ymm13, ymm8, [rax+-96] + vpxor ymm14, ymm9, [r8+-96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+224] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+32], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [rax+-96], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+64] + vpxor ymm11, ymm9, [r8+-32] + vpxor ymm12, ymm5, [rcx+-64] + vpxor ymm13, ymm6, [rax+-64] + vpxor ymm14, ymm7, [r8+96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + 
vmovdqu YMMWORD PTR [rax+-64], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rax] + vpxor ymm11, ymm7, [r8] + vpxor ymm12, ymm8, [rcx+128] + vpxor ymm13, ymm9, [rax+128] + vpxor ymm14, ymm5, [rcx] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [rax+128], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+-64] + vpxor ymm11, ymm5, [rcx+-96] + vpxor ymm12, ymm6, [rax+64] + vpxor ymm13, ymm7, [r8+64] + vpxor ymm14, ymm8, [rcx+32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [rax+64], ymm2 + vmovdqu 
YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+128] + vpxor ymm11, ymm8, [rcx+96] + vpxor ymm12, ymm9, [rax+96] + vpxor ymm13, ymm5, [rcx+-32] + vpxor ymm14, ymm6, [rax+-32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [rax+96], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [rax+-32], ymm4 + ; Round 8 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-96] + vpxor ymm12, ymm2, [rcx+-64] + vpxor ymm14, ymm4, [rcx] + vpxor ymm14, ymm14, [rcx+32] + vpxor ymm10, ymm10, [rcx+64] + vpxor ymm12, ymm12, [rcx+128] + vpxor ymm13, ymm3, [rax+-96] + vpxor ymm13, ymm13, [rax+-64] + vpxor ymm10, ymm10, [rax] + vpxor ymm11, ymm11, [rax+32] + vpxor ymm12, ymm12, [rax+64] + vpxor ymm13, ymm13, [rax+128] + vpxor ymm14, ymm14, [r8+-96] + vpxor ymm10, ymm10, [r8+-64] + vpxor ymm11, ymm11, [r8+-32] + vpxor ymm11, ymm11, [r8] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm14, ymm14, [r8+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, 
ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+-32] + vpxor ymm12, ymm7, [rcx+128] + vpxor ymm13, ymm8, [r8+64] + vpxor ymm14, ymm9, [rax+-32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+256] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rax+-32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rax+-96] + vpxor ymm11, ymm9, [r8+96] + vpxor ymm12, ymm5, [rax] + vpxor ymm13, ymm6, [rcx+-96] + vpxor ymm14, ymm7, [rax+96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + 
vmovdqu YMMWORD PTR [rax+-96], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [rax], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [rax+96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rax+32] + vpxor ymm11, ymm7, [rcx+-64] + vpxor ymm12, ymm8, [rax+128] + vpxor ymm13, ymm9, [rcx+32] + vpxor ymm14, ymm5, [r8+128] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+32], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [rax+128], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+-96] + vpxor ymm11, ymm5, [rcx+64] + vpxor ymm12, ymm6, [r8] + vpxor ymm13, ymm7, [rax+64] + vpxor ymm14, ymm8, [rcx+-32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + 
vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [rax+64], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+32] + vpxor ymm11, ymm8, [rax+-64] + vpxor ymm12, ymm9, [rcx] + vpxor ymm13, ymm5, [r8+-64] + vpxor ymm14, ymm6, [rcx+96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [rax+-64], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Round 9 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm13, ymm13, [rcx+32] + vpxor ymm11, ymm11, [rcx+64] + vpxor ymm12, ymm2, [rcx+128] + vpxor ymm10, ymm10, [rax+-96] + vpxor ymm14, ymm14, [rax+-32] + vpxor ymm12, ymm12, [rax] + vpxor ymm10, ymm10, [rax+32] + vpxor ymm13, ymm13, [rax+64] + vpxor ymm14, ymm14, [rax+96] + vpxor ymm12, ymm12, [rax+128] + vpxor ymm10, ymm10, [r8+-96] + vpxor ymm11, ymm11, [r8+-32] + vpxor ymm12, ymm12, [r8] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm11, ymm11, [r8+96] + vpxor ymm14, ymm14, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, 
ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+96] + vpxor ymm12, ymm7, [rax+128] + vpxor ymm13, ymm8, [rax+64] + vpxor ymm14, ymm9, [rcx+96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+288] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [rax+128], ymm2 + vmovdqu YMMWORD PTR [rax+64], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+64] + vpxor ymm11, ymm9, [rax+96] + vpxor ymm12, ymm5, [rax+32] + vpxor ymm13, ymm6, [rcx+64] + vpxor ymm14, ymm7, [rcx] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, 
ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [rax+96], ymm1 + vmovdqu YMMWORD PTR [rax+32], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+-32] + vpxor ymm11, ymm7, [rax] + vpxor ymm12, ymm8, [rcx+32] + vpxor ymm13, ymm9, [rcx+-32] + vpxor ymm14, ymm5, [r8+32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [rax], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rax+-32] + vpxor ymm11, ymm5, [rax+-96] + vpxor ymm12, ymm6, [rcx+-64] + vpxor ymm13, ymm7, [r8] + vpxor ymm14, ymm8, [r8+-64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + 
vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+-32], ymm0 + vmovdqu YMMWORD PTR [rax+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+128] + vpxor ymm11, ymm8, [rcx+-96] + vpxor ymm12, ymm9, [r8+128] + vpxor ymm13, ymm5, [r8+-96] + vpxor ymm14, ymm6, [rax+-64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [rax+-64], ymm4 + ; Round 10 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-64] + vpxor ymm13, ymm3, [rcx+-32] + vpxor ymm14, ymm4, [rcx] + vpxor ymm12, ymm12, [rcx+32] + vpxor ymm13, ymm13, [rcx+64] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm11, ymm1, [rax+-96] + vpxor ymm10, ymm10, [rax+-32] + vpxor ymm11, ymm11, [rax] + vpxor ymm12, ymm12, [rax+32] + vpxor ymm13, ymm13, [rax+64] + vpxor ymm11, ymm11, [rax+96] + vpxor ymm12, ymm12, [rax+128] + vpxor ymm14, ymm14, [r8+-64] + vpxor ymm10, ymm10, [r8+-32] + vpxor ymm13, ymm13, [r8] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm10, ymm10, [r8+64] + vpxor ymm11, ymm11, [r8+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq 
ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rax+96] + vpxor ymm12, ymm7, [rcx+32] + vpxor ymm13, ymm8, [r8] + vpxor ymm14, ymm9, [rax+-64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+320] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+96], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [rax+-64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rax+64] + vpxor ymm11, ymm9, [rcx] + vpxor ymm12, ymm5, [r8+-32] + vpxor ymm13, ymm6, [rax+-96] + vpxor ymm14, ymm7, [r8+128] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + 
vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+64], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [rax+-96], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+96] + vpxor ymm11, ymm7, [rax+32] + vpxor ymm12, ymm8, [rcx+-32] + vpxor ymm13, ymm9, [r8+-64] + vpxor ymm14, ymm5, [rcx+128] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [rax+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+96] + vpxor ymm11, ymm5, [r8+64] + vpxor ymm12, ymm6, [rax] + vpxor ymm13, ymm7, [rcx+-64] + vpxor ymm14, ymm8, [r8+-96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + 
vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [rax], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rax+128] + vpxor ymm11, ymm8, [rcx+64] + vpxor ymm12, ymm9, [r8+32] + vpxor ymm13, ymm5, [rax+-32] + vpxor ymm14, ymm6, [rcx+-96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+128], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [rax+-32], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Round 11 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm11, ymm1, [rcx] + vpxor ymm12, ymm12, [rcx+32] + vpxor ymm10, ymm10, [rcx+96] + vpxor ymm14, ymm4, [rcx+128] + vpxor ymm13, ymm13, [rax+-96] + vpxor ymm14, ymm14, [rax+-64] + vpxor ymm12, ymm12, [rax] + vpxor ymm11, ymm11, [rax+32] + vpxor ymm10, ymm10, [rax+64] + vpxor ymm11, ymm11, [rax+96] + vpxor ymm14, ymm14, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm13, ymm13, [r8] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm10, ymm10, [r8+96] 
+ vpxor ymm14, ymm14, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx] + vpxor ymm12, ymm7, [rcx+-32] + vpxor ymm13, ymm8, [rcx+-64] + vpxor ymm14, ymm9, [rcx+-96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+352] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8] + vpxor ymm11, ymm9, [r8+128] + vpxor ymm12, ymm5, [r8+96] + vpxor ymm13, ymm6, [r8+64] + vpxor ymm14, ymm7, [r8+32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + 
vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rax+96] + vpxor ymm11, ymm7, [r8+-32] + vpxor ymm12, ymm8, [r8+-64] + vpxor ymm13, ymm9, [r8+-96] + vpxor ymm14, ymm5, [rax+128] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+96], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [rax+128], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rax+-64] + vpxor ymm11, ymm5, [rax+64] + vpxor ymm12, ymm6, [rax+32] + vpxor ymm13, ymm7, [rax] + vpxor ymm14, ymm8, [rax+-32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor 
ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+-64], ymm0 + vmovdqu YMMWORD PTR [rax+64], ymm1 + vmovdqu YMMWORD PTR [rax+32], ymm2 + vmovdqu YMMWORD PTR [rax], ymm3 + vmovdqu YMMWORD PTR [rax+-32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+32] + vpxor ymm11, ymm8, [rax+-96] + vpxor ymm12, ymm9, [rcx+128] + vpxor ymm13, ymm5, [rcx+96] + vpxor ymm14, ymm6, [rcx+64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rax+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Round 12 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm11, ymm1, [rcx] + vpxor ymm10, ymm10, [rax+-64] + vpxor ymm14, ymm14, [rax+-32] + vpxor ymm13, ymm13, [rax] + vpxor ymm12, ymm12, [rax+32] + vpxor ymm11, ymm11, [rax+64] + vpxor ymm10, ymm10, [rax+96] + vpxor ymm14, ymm14, [rax+128] + vpxor ymm13, ymm13, [r8+-96] + vpxor ymm12, ymm12, [r8+-64] + vpxor ymm11, ymm11, [r8+-32] + vpxor 
ymm10, ymm10, [r8] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm12, ymm12, [r8+96] + vpxor ymm11, ymm11, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+128] + vpxor ymm12, ymm7, [r8+-64] + vpxor ymm13, ymm8, [rax] + vpxor ymm14, ymm9, [rcx+64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+384] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [rax], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+-64] + vpxor ymm11, ymm9, [r8+32] + vpxor ymm12, ymm5, [rax+96] + vpxor ymm13, ymm6, [rax+64] + vpxor ymm14, ymm7, [rcx+128] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, 
ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [rax+96], ymm2 + vmovdqu YMMWORD PTR [rax+64], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx] + vpxor ymm11, ymm7, [r8+96] + vpxor ymm12, ymm8, [r8+-96] + vpxor ymm13, ymm9, [rax+-32] + vpxor ymm14, ymm5, [rcx+32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [rax+-32], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+-96] + vpxor ymm11, ymm5, [r8] + vpxor ymm12, ymm6, [r8+-32] + vpxor ymm13, ymm7, [rax+32] + vpxor ymm14, ymm8, [rcx+96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 
15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [rax+32], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+-32] + vpxor ymm11, ymm8, [r8+64] + vpxor ymm12, ymm9, [rax+128] + vpxor ymm13, ymm5, [rax+-64] + vpxor ymm14, ymm6, [rax+-96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [rax+128], ymm2 + vmovdqu YMMWORD PTR [rax+-64], ymm3 + vmovdqu YMMWORD PTR [rax+-96], ymm4 + ; Round 13 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-96] + vpxor ymm10, ymm10, [rcx+-64] + vpxor ymm10, ymm10, [rcx] + vpxor ymm14, ymm4, [rcx+32] + vpxor ymm14, ymm14, [rcx+64] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm14, ymm14, [rcx+128] + vpxor ymm13, ymm3, [rax+-32] + vpxor ymm13, ymm13, [rax] + vpxor ymm13, ymm13, [rax+32] + vpxor ymm13, ymm13, 
[rax+64] + vpxor ymm12, ymm2, [rax+96] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm12, ymm12, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm11, ymm1, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm11, ymm11, [r8+96] + vpxor ymm11, ymm11, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+32] + vpxor ymm12, ymm7, [r8+-96] + vpxor ymm13, ymm8, [rax+32] + vpxor ymm14, ymm9, [rax+-96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+416] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [rax+32], ymm3 + vmovdqu YMMWORD PTR [rax+-96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rax] + vpxor ymm11, ymm9, [rcx+128] + vpxor ymm12, ymm5, [rcx] + vpxor ymm13, ymm6, [r8] + vpxor ymm14, ymm7, [rax+128] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, 
ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [rax+128], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+128] + vpxor ymm11, ymm7, [rax+96] + vpxor ymm12, ymm8, [rax+-32] + vpxor ymm13, ymm9, [rcx+96] + vpxor ymm14, ymm5, [rcx+-32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [rax+96], ymm1 + vmovdqu YMMWORD PTR [rax+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+64] + vpxor ymm11, ymm5, [rcx+-64] + vpxor ymm12, ymm6, [r8+96] + vpxor ymm13, ymm7, [r8+-32] + vpxor ymm14, ymm8, [rax+-64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, 
ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [rax+-64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+-64] + vpxor ymm11, ymm8, [rax+64] + vpxor ymm12, ymm9, [rcx+32] + vpxor ymm13, ymm5, [rcx+-96] + vpxor ymm14, ymm6, [r8+64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [rax+64], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Round 14 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm12, ymm2, [rcx] + vpxor ymm10, ymm10, [rcx+64] + vpxor ymm13, ymm3, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm14, ymm14, [rax+-96] + 
vpxor ymm14, ymm14, [rax+-64] + vpxor ymm12, ymm12, [rax+-32] + vpxor ymm10, ymm10, [rax] + vpxor ymm13, ymm13, [rax+32] + vpxor ymm11, ymm11, [rax+96] + vpxor ymm14, ymm14, [rax+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm13, ymm13, [r8+-32] + vpxor ymm13, ymm13, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm12, ymm12, [r8+96] + vpxor ymm10, ymm10, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+128] + vpxor ymm12, ymm7, [rax+-32] + vpxor ymm13, ymm8, [r8+-32] + vpxor ymm14, ymm9, [r8+64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+448] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [rax+-32], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rax+32] + vpxor ymm11, ymm9, [rax+128] + vpxor ymm12, ymm5, [r8+128] + vpxor ymm13, ymm6, [rcx+-64] + vpxor ymm14, 
ymm7, [rcx+32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+32], ymm0 + vmovdqu YMMWORD PTR [rax+128], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+32] + vpxor ymm11, ymm7, [rcx] + vpxor ymm12, ymm8, [rcx+96] + vpxor ymm13, ymm9, [rax+-64] + vpxor ymm14, ymm5, [r8+-64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [rax+-64], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rax+-96] + vpxor ymm11, ymm5, [rax] + vpxor ymm12, ymm6, [rax+96] + vpxor ymm13, ymm7, [r8+96] + vpxor ymm14, ymm8, 
[rcx+-96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+-96], ymm0 + vmovdqu YMMWORD PTR [rax], ymm1 + vmovdqu YMMWORD PTR [rax+96], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+-96] + vpxor ymm11, ymm8, [r8] + vpxor ymm12, ymm9, [rcx+-32] + vpxor ymm13, ymm5, [rcx+64] + vpxor ymm14, ymm6, [rax+64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [rax+64], ymm4 + ; Round 15 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm11, ymm1, [rcx] + vpxor ymm14, 
ymm14, [rcx+32] + vpxor ymm12, ymm2, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm10, ymm10, [rax+-96] + vpxor ymm13, ymm13, [rax+-64] + vpxor ymm12, ymm12, [rax+-32] + vpxor ymm11, ymm11, [rax] + vpxor ymm10, ymm10, [rax+32] + vpxor ymm12, ymm12, [rax+96] + vpxor ymm11, ymm11, [rax+128] + vpxor ymm14, ymm14, [r8+-64] + vpxor ymm13, ymm13, [r8+-32] + vpxor ymm10, ymm10, [r8+32] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm12, ymm12, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rax+128] + vpxor ymm12, ymm7, [rcx+96] + vpxor ymm13, ymm8, [r8+96] + vpxor ymm14, ymm9, [rax+64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+480] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+128], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [rax+64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, 
[r8+-32] + vpxor ymm11, ymm9, [rcx+32] + vpxor ymm12, ymm5, [r8+32] + vpxor ymm13, ymm6, [rax] + vpxor ymm14, ymm7, [rcx+-32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [rax], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+128] + vpxor ymm11, ymm7, [r8+128] + vpxor ymm12, ymm8, [rax+-64] + vpxor ymm13, ymm9, [rcx+-96] + vpxor ymm14, ymm5, [r8+-96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [rax+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, 
[r8+64] + vpxor ymm11, ymm5, [rax+32] + vpxor ymm12, ymm6, [rcx] + vpxor ymm13, ymm7, [rax+96] + vpxor ymm14, ymm8, [rcx+64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [rax+32], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [rax+96], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rax+-32] + vpxor ymm11, ymm8, [rcx+-64] + vpxor ymm12, ymm9, [r8+-64] + vpxor ymm13, ymm5, [rax+-96] + vpxor ymm14, ymm6, [r8] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+-32], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [rax+-96], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Round 16 + ; Calc b[0..4] + vpxor ymm10, 
ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm12, ymm2, [rcx] + vpxor ymm11, ymm1, [rcx+32] + vpxor ymm14, ymm14, [rcx+64] + vpxor ymm12, ymm12, [rcx+96] + vpxor ymm10, ymm10, [rcx+128] + vpxor ymm12, ymm12, [rax+-64] + vpxor ymm13, ymm13, [rax] + vpxor ymm11, ymm11, [rax+32] + vpxor ymm14, ymm14, [rax+64] + vpxor ymm13, ymm13, [rax+96] + vpxor ymm11, ymm11, [rax+128] + vpxor ymm14, ymm14, [r8+-96] + vpxor ymm10, ymm10, [r8+-32] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm10, ymm10, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm11, ymm11, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+32] + vpxor ymm12, ymm7, [rax+-64] + vpxor ymm13, ymm8, [rax+96] + vpxor ymm14, ymm9, [r8] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+512] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [rax+-64], ymm2 
+ vmovdqu YMMWORD PTR [rax+96], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+96] + vpxor ymm11, ymm9, [rcx+-32] + vpxor ymm12, ymm5, [rcx+128] + vpxor ymm13, ymm6, [rax+32] + vpxor ymm14, ymm7, [r8+-64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [rax+32], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rax+128] + vpxor ymm11, ymm7, [r8+32] + vpxor ymm12, ymm8, [rcx+-96] + vpxor ymm13, ymm9, [rcx+64] + vpxor ymm14, ymm5, [rax+-32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+128], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 
+ vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [rax+-32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rax+64] + vpxor ymm11, ymm5, [r8+-32] + vpxor ymm12, ymm6, [r8+128] + vpxor ymm13, ymm7, [rcx] + vpxor ymm14, ymm8, [rax+-96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+64], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [rax+-96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+96] + vpxor ymm11, ymm8, [rax] + vpxor ymm12, ymm9, [r8+-96] + vpxor ymm13, ymm5, [r8+64] + vpxor ymm14, ymm6, [rcx+-64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [rax], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu 
YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Round 17 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx] + vpxor ymm11, ymm11, [rcx+32] + vpxor ymm13, ymm13, [rcx+64] + vpxor ymm12, ymm12, [rcx+128] + vpxor ymm14, ymm4, [rax+-96] + vpxor ymm12, ymm12, [rax+-64] + vpxor ymm14, ymm14, [rax+-32] + vpxor ymm13, ymm13, [rax+32] + vpxor ymm10, ymm10, [rax+64] + vpxor ymm13, ymm13, [rax+96] + vpxor ymm10, ymm10, [rax+128] + vpxor ymm14, ymm14, [r8+-64] + vpxor ymm11, ymm11, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm10, ymm10, [r8+96] + vpxor ymm12, ymm12, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+-32] + vpxor ymm12, ymm7, [rcx+-96] + vpxor ymm13, ymm8, [rcx] + vpxor ymm14, ymm9, [rcx+-64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+544] + vpxor ymm3, 
ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rax+96] + vpxor ymm11, ymm9, [r8+-64] + vpxor ymm12, ymm5, [rax+128] + vpxor ymm13, ymm6, [r8+-32] + vpxor ymm14, ymm7, [r8+-96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+96], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [rax+128], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+32] + vpxor ymm11, ymm7, [rcx+128] + vpxor ymm12, ymm8, [rcx+64] + vpxor ymm13, ymm9, [rax+-96] + vpxor ymm14, ymm5, [rcx+96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, 
ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rax+-96], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8] + vpxor ymm11, ymm5, [r8+96] + vpxor ymm12, ymm6, [r8+32] + vpxor ymm13, ymm7, [r8+128] + vpxor ymm14, ymm8, [r8+64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rax+-64] + vpxor ymm11, ymm8, [rax+32] + vpxor ymm12, ymm9, [rax+-32] + vpxor ymm13, ymm5, [rax+64] + vpxor ymm14, ymm6, [rax] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu 
YMMWORD PTR [rax+-64], ymm0 + vmovdqu YMMWORD PTR [rax+32], ymm1 + vmovdqu YMMWORD PTR [rax+-32], ymm2 + vmovdqu YMMWORD PTR [rax+64], ymm3 + vmovdqu YMMWORD PTR [rax], ymm4 + ; Round 18 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx] + vpxor ymm10, ymm10, [rcx+32] + vpxor ymm12, ymm12, [rcx+64] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm13, ymm13, [rax+-96] + vpxor ymm10, ymm10, [rax+96] + vpxor ymm12, ymm12, [rax+128] + vpxor ymm14, ymm14, [r8+-96] + vpxor ymm11, ymm11, [r8+-64] + vpxor ymm13, ymm13, [r8+-32] + vpxor ymm10, ymm10, [r8] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm11, ymm11, [r8+96] + vpxor ymm13, ymm13, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+-64] + vpxor ymm12, ymm7, [rcx+64] + vpxor ymm13, ymm8, [r8+128] + vpxor ymm14, ymm9, [rax] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor 
ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+576] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [rax], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx] + vpxor ymm11, ymm9, [r8+-96] + vpxor ymm12, ymm5, [rcx+32] + vpxor ymm13, ymm6, [r8+96] + vpxor ymm14, ymm7, [rax+-32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [rax+-32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+-32] + vpxor ymm11, ymm7, [rax+128] + vpxor ymm12, ymm8, [rax+-96] + vpxor ymm13, ymm9, [r8+64] + vpxor ymm14, ymm5, [rax+-64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 
+ vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [rax+128], ymm1 + vmovdqu YMMWORD PTR [rax+-96], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rax+-64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+-64] + vpxor ymm11, ymm5, [rax+96] + vpxor ymm12, ymm6, [rcx+128] + vpxor ymm13, ymm7, [r8+32] + vpxor ymm14, ymm8, [rax+64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [rax+96], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [rax+64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+-96] + vpxor ymm11, ymm8, [r8+-32] + vpxor ymm12, ymm9, [rcx+96] + vpxor ymm13, ymm5, [r8] + vpxor ymm14, ymm6, [rax+32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + 
vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [rax+32], ymm4 + ; Round 19 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-64] + vpxor ymm10, ymm10, [rcx+-32] + vpxor ymm10, ymm10, [rcx] + vpxor ymm12, ymm2, [rcx+32] + vpxor ymm12, ymm12, [rcx+64] + vpxor ymm12, ymm12, [rcx+128] + vpxor ymm12, ymm12, [rax+-96] + vpxor ymm14, ymm4, [rax+-64] + vpxor ymm14, ymm14, [rax+-32] + vpxor ymm14, ymm14, [rax] + vpxor ymm14, ymm14, [rax+64] + vpxor ymm11, ymm1, [rax+96] + vpxor ymm11, ymm11, [rax+128] + vpxor ymm11, ymm11, [r8+-96] + vpxor ymm11, ymm11, [r8+-64] + vpxor ymm13, ymm3, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm13, ymm13, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+-96] + vpxor ymm12, ymm7, [rax+-96] + vpxor ymm13, ymm8, [r8+32] + vpxor ymm14, ymm9, [rax+32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn 
ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+608] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [rax+-96], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [rax+32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+128] + vpxor ymm11, ymm9, [rax+-32] + vpxor ymm12, ymm5, [rcx+-32] + vpxor ymm13, ymm6, [rax+96] + vpxor ymm14, ymm7, [rcx+96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [rax+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [rax+96], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+-64] + vpxor ymm11, ymm7, [rcx+32] + vpxor ymm12, ymm8, [r8+64] + vpxor ymm13, ymm9, [rax+64] + vpxor ymm14, ymm5, [rcx+-96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, 
ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [rax+64], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rax] + vpxor ymm11, ymm5, [rcx] + vpxor ymm12, ymm6, [rax+128] + vpxor ymm13, ymm7, [rcx+128] + vpxor ymm14, ymm8, [r8] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [rax+128], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+64] + vpxor ymm11, ymm8, [r8+96] + vpxor ymm12, ymm9, [rax+-64] + vpxor ymm13, ymm5, [rcx+-64] + vpxor ymm14, ymm6, [r8+-32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + 
vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [rax+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Round 20 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm11, ymm1, [rcx] + vpxor ymm11, ymm11, [rcx+32] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm13, ymm3, [rcx+128] + vpxor ymm12, ymm12, [rax+-96] + vpxor ymm11, ymm11, [rax+-32] + vpxor ymm10, ymm10, [rax] + vpxor ymm14, ymm14, [rax+32] + vpxor ymm13, ymm13, [rax+64] + vpxor ymm13, ymm13, [rax+96] + vpxor ymm12, ymm12, [rax+128] + vpxor ymm11, ymm11, [r8+-96] + vpxor ymm10, ymm10, [r8+-64] + vpxor ymm14, ymm14, [r8] + vpxor ymm13, ymm13, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm10, ymm10, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rax+-32] + vpxor ymm12, ymm7, [r8+64] + vpxor ymm13, ymm8, [rcx+128] + vpxor ymm14, ymm9, [r8+-32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor 
ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+640] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+-32], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+32] + vpxor ymm11, ymm9, [rcx+96] + vpxor ymm12, ymm5, [r8+-64] + vpxor ymm13, ymm6, [rcx] + vpxor ymm14, ymm7, [rax+-64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [rax+-64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+-96] + vpxor ymm11, ymm7, [rcx+-32] + vpxor ymm12, ymm8, [rax+64] + vpxor ymm13, ymm9, [r8] + vpxor ymm14, ymm5, [rcx+64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, 
ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rax+64], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rax+32] + vpxor ymm11, ymm5, [r8+128] + vpxor ymm12, ymm6, [rcx+32] + vpxor ymm13, ymm7, [rax+128] + vpxor ymm14, ymm8, [rcx+-64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+32], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [rax+128], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rax+-96] + vpxor ymm11, ymm8, [rax+96] + vpxor ymm12, ymm9, [rcx+-96] + vpxor ymm13, ymm5, [rax] + vpxor ymm14, ymm6, [r8+96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 
+ vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+-96], ymm0 + vmovdqu YMMWORD PTR [rax+96], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [rax], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Round 21 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx] + vpxor ymm12, ymm2, [rcx+32] + vpxor ymm14, ymm14, [rcx+64] + vpxor ymm11, ymm11, [rcx+96] + vpxor ymm13, ymm13, [rcx+128] + vpxor ymm14, ymm14, [rax+-64] + vpxor ymm11, ymm11, [rax+-32] + vpxor ymm10, ymm10, [rax+32] + vpxor ymm12, ymm12, [rax+64] + vpxor ymm13, ymm13, [rax+128] + vpxor ymm10, ymm10, [r8+-96] + vpxor ymm12, ymm12, [r8+-64] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm13, ymm13, [r8] + vpxor ymm10, ymm10, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm11, ymm11, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+96] + vpxor ymm12, ymm7, [rax+64] + vpxor ymm13, ymm8, [rax+128] + vpxor ymm14, ymm9, [r8+96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + 
vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+672] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [rax+64], ymm2 + vmovdqu YMMWORD PTR [rax+128], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+128] + vpxor ymm11, ymm9, [rax+-64] + vpxor ymm12, ymm5, [r8+-96] + vpxor ymm13, ymm6, [r8+128] + vpxor ymm14, ymm7, [rcx+-96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [rax+-64], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rax+-32] + vpxor ymm11, ymm7, [r8+-64] + vpxor ymm12, ymm8, [r8] + vpxor ymm13, ymm9, [rcx+-64] + vpxor ymm14, ymm5, [rax+-96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + 
vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+-32], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [rax+-96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+-32] + vpxor ymm11, ymm5, [r8+32] + vpxor ymm12, ymm6, [rcx+-32] + vpxor ymm13, ymm7, [rcx+32] + vpxor ymm14, ymm8, [rax] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [rax], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+64] + vpxor ymm11, ymm8, [rcx] + vpxor ymm12, ymm9, [rcx+64] + vpxor ymm13, ymm5, [rax+32] + vpxor ymm14, ymm6, [rax+96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, 
ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rax+32], ymm3 + vmovdqu YMMWORD PTR [rax+96], ymm4 + ; Round 22 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm13, ymm13, [rcx+32] + vpxor ymm11, ymm1, [rcx+96] + vpxor ymm10, ymm10, [rcx+128] + vpxor ymm14, ymm14, [rax+-96] + vpxor ymm11, ymm11, [rax+-64] + vpxor ymm10, ymm10, [rax+-32] + vpxor ymm14, ymm14, [rax] + vpxor ymm12, ymm12, [rax+64] + vpxor ymm13, ymm13, [rax+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm11, ymm11, [r8+-64] + vpxor ymm10, ymm10, [r8+-32] + vpxor ymm12, ymm12, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm14, ymm14, [r8+96] + vpxor ymm13, ymm13, [r8+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rax+-64] + vpxor ymm12, ymm7, [r8] + vpxor ymm13, ymm8, [rcx+32] + vpxor ymm14, ymm9, [rax+96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, 
ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+704] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+-64], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [rax+96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rax+128] + vpxor ymm11, ymm9, [rcx+-96] + vpxor ymm12, ymm5, [rax+-32] + vpxor ymm13, ymm6, [r8+32] + vpxor ymm14, ymm7, [rcx+64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+128], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [rax+-32], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+96] + vpxor ymm11, ymm7, [r8+-96] + vpxor ymm12, ymm8, [rcx+-64] + vpxor ymm13, ymm9, [rax] + vpxor ymm14, ymm5, [r8+64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + 
vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [rax], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+96] + vpxor ymm11, ymm5, [rcx+128] + vpxor ymm12, ymm6, [r8+-64] + vpxor ymm13, ymm7, [rcx+-32] + vpxor ymm14, ymm8, [rax+32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [rax+32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rax+64] + vpxor ymm11, ymm8, [r8+128] + vpxor ymm12, ymm9, [rax+-96] + vpxor ymm13, ymm5, [r8+-32] + vpxor ymm14, ymm6, [rcx] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + 
vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+64], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [rax+-96], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Round 23 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-96] + vpxor ymm12, ymm2, [rcx+-64] + vpxor ymm13, ymm3, [rcx+-32] + vpxor ymm13, ymm13, [rcx+32] + vpxor ymm14, ymm4, [rcx+64] + vpxor ymm10, ymm10, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm11, ymm11, [rax+-64] + vpxor ymm12, ymm12, [rax+-32] + vpxor ymm13, ymm13, [rax] + vpxor ymm14, ymm14, [rax+32] + vpxor ymm14, ymm14, [rax+96] + vpxor ymm10, ymm10, [rax+128] + vpxor ymm11, ymm11, [r8+-96] + vpxor ymm12, ymm12, [r8+-64] + vpxor ymm12, ymm12, [r8] + vpxor ymm13, ymm13, [r8+32] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm10, ymm10, [r8+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+-96] + vpxor 
ymm12, ymm7, [rcx+-64] + vpxor ymm13, ymm8, [rcx+-32] + vpxor ymm14, ymm9, [rcx] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rdx+736] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+32] + vpxor ymm11, ymm9, [rcx+64] + vpxor ymm12, ymm5, [rcx+96] + vpxor ymm13, ymm6, [rcx+128] + vpxor ymm14, ymm7, [rax+-96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [rax+-96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rax+-64] + vpxor ymm11, ymm7, [rax+-32] + vpxor ymm12, ymm8, [rax] + vpxor ymm13, ymm9, [rax+32] + 
vpxor ymm14, ymm5, [rax+64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+-64], ymm0 + vmovdqu YMMWORD PTR [rax+-32], ymm1 + vmovdqu YMMWORD PTR [rax], ymm2 + vmovdqu YMMWORD PTR [rax+32], ymm3 + vmovdqu YMMWORD PTR [rax+64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rax+96] + vpxor ymm11, ymm5, [rax+128] + vpxor ymm12, ymm6, [r8+-96] + vpxor ymm13, ymm7, [r8+-64] + vpxor ymm14, ymm8, [r8+-32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rax+96], ymm0 + vmovdqu YMMWORD PTR [rax+128], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8] + vpxor ymm11, ymm8, [r8+32] + vpxor ymm12, ymm9, [r8+64] + vpxor ymm13, ymm5, [r8+96] + vpxor 
ymm14, ymm6, [r8+128] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + sub rcx, 128 + vmovdqu YMMWORD PTR [rcx], ymm15 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +sha3_blocksx4_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_sha3_128_blockx4_seed_avx2_end_mark QWORD 8000000000000000h, 8000000000000000h + QWORD 8000000000000000h, 8000000000000000h +ptr_L_sha3_128_blockx4_seed_avx2_end_mark QWORD L_sha3_128_blockx4_seed_avx2_end_mark +_DATA ENDS +_TEXT SEGMENT READONLY PARA +sha3_128_blocksx4_seed_avx2 PROC + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR 
[rsp+144], xmm15 + mov rax, QWORD PTR [ptr_L_sha3_x4_avx2_r] + mov r8, rcx + mov r9, rcx + vpbroadcastq ymm15, QWORD PTR [rdx] + add rcx, 128 + vpbroadcastq ymm11, QWORD PTR [rdx+8] + add r8, 384 + vpbroadcastq ymm12, QWORD PTR [rdx+16] + add r9, 640 + vpbroadcastq ymm13, QWORD PTR [rdx+24] + vmovdqu ymm5, YMMWORD PTR L_sha3_128_blockx4_seed_avx2_end_mark + vpxor ymm6, ymm6, ymm6 + vmovdqu YMMWORD PTR [rcx+-96], ymm11 + vmovdqu YMMWORD PTR [rcx+-64], ymm12 + vmovdqu YMMWORD PTR [rcx+-32], ymm13 + vmovdqu ymm14, YMMWORD PTR [rcx] + vmovdqu YMMWORD PTR [rcx+32], ymm6 + vmovdqu YMMWORD PTR [rcx+64], ymm6 + vmovdqu YMMWORD PTR [rcx+96], ymm6 + vmovdqu YMMWORD PTR [rcx+128], ymm6 + vmovdqu YMMWORD PTR [r8+-96], ymm6 + vmovdqu YMMWORD PTR [r8+-64], ymm6 + vmovdqu YMMWORD PTR [r8+-32], ymm6 + vmovdqu YMMWORD PTR [r8], ymm6 + vmovdqu YMMWORD PTR [r8+32], ymm6 + vmovdqu YMMWORD PTR [r8+64], ymm6 + vmovdqu YMMWORD PTR [r8+96], ymm6 + vmovdqu YMMWORD PTR [r8+128], ymm6 + vmovdqu YMMWORD PTR [r9+-96], ymm6 + vmovdqu YMMWORD PTR [r9+-64], ymm6 + vmovdqu YMMWORD PTR [r9+-32], ymm6 + vmovdqu YMMWORD PTR [r9], ymm5 + vmovdqu YMMWORD PTR [r9+32], ymm6 + vmovdqu YMMWORD PTR [r9+64], ymm6 + vmovdqu YMMWORD PTR [r9+96], ymm6 + vmovdqu YMMWORD PTR [r9+128], ymm6 + vpxor ymm10, ymm15, ymm5 + ; Round 0 + ; Calc b[0..4] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+64] + vpxor ymm12, ymm7, [r8] + vpxor ymm13, ymm8, [r9+-64] + vpxor ymm14, 
ymm9, [r9+128] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+-32] + vpxor ymm11, ymm9, [r8+-96] + vpxor ymm12, ymm5, [r8+-64] + vpxor ymm13, ymm6, [r8+128] + vpxor ymm14, ymm7, [r9+64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+-96] + vpxor ymm11, ymm7, [rcx+96] + vpxor ymm12, ymm8, [r8+32] + vpxor ymm13, ymm9, [r9+-32] + vpxor ymm14, ymm5, [r9] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + 
vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx] + vpxor ymm11, ymm5, [rcx+32] + vpxor ymm12, ymm6, [r8+-32] + vpxor ymm13, ymm7, [r9+-96] + vpxor ymm14, ymm8, [r9+96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+-64] + vpxor ymm11, ymm8, [rcx+128] + vpxor ymm12, ymm9, [r8+64] + vpxor ymm13, ymm5, [r8+96] + vpxor ymm14, ymm6, [r9+32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, 
ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Round 1 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-96] + vpxor ymm10, ymm10, [rcx+-32] + vpxor ymm10, ymm10, [rcx] + vpxor ymm11, ymm1, [rcx+32] + vpxor ymm11, ymm11, [rcx+64] + vpxor ymm11, ymm11, [rcx+96] + vpxor ymm11, ymm11, [r8+-96] + vpxor ymm12, ymm2, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm12, ymm12, [r8] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm13, ymm3, [r8+128] + vpxor ymm13, ymm13, [r9+-96] + vpxor ymm13, ymm13, [r9+-64] + vpxor ymm13, ymm13, [r9+-32] + vpxor ymm14, ymm4, [r9] + vpxor ymm14, ymm14, [r9+64] + vpxor ymm14, ymm14, [r9+96] + vpxor ymm14, ymm14, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, 
ymm6, [r8+-96] + vpxor ymm12, ymm7, [r8+32] + vpxor ymm13, ymm8, [r9+-96] + vpxor ymm14, ymm9, [r9+32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+32] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+-64] + vpxor ymm11, ymm9, [r9+64] + vpxor ymm12, ymm5, [rcx+-96] + vpxor ymm13, ymm6, [rcx+32] + vpxor ymm14, ymm7, [r8+64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm0 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+64] + vpxor ymm11, ymm7, [r8+-64] + vpxor ymm12, ymm8, [r9+-32] + vpxor ymm13, ymm9, 
[r9+96] + vpxor ymm14, ymm5, [rcx+-64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+128] + vpxor ymm11, ymm5, [rcx+-32] + vpxor ymm12, ymm6, [rcx+96] + vpxor ymm13, ymm7, [r8+-32] + vpxor ymm14, ymm8, [r8+96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8] + vpxor ymm11, ymm8, [r8+128] + vpxor ymm12, ymm9, [r9] + vpxor ymm13, ymm5, [rcx] + 
vpxor ymm14, ymm6, [rcx+128] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Round 2 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx+32] + vpxor ymm10, ymm10, [rcx+64] + vpxor ymm12, ymm12, [rcx+96] + vpxor ymm11, ymm11, [r8+-96] + vpxor ymm11, ymm11, [r8+-64] + vpxor ymm13, ymm13, [r8+-32] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm14, ymm14, [r8+96] + vpxor ymm13, ymm13, [r9+-96] + vpxor ymm10, ymm10, [r9+-64] + vpxor ymm12, ymm12, [r9+-32] + vpxor ymm14, ymm14, [r9+32] + vpxor ymm11, ymm11, [r9+64] + vpxor ymm13, ymm13, [r9+96] + vpxor ymm10, ymm10, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 
+ vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+64] + vpxor ymm12, ymm7, [r9+-32] + vpxor ymm13, ymm8, [r8+-32] + vpxor ymm14, ymm9, [rcx+128] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+64] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+-96] + vpxor ymm11, ymm9, [r8+64] + vpxor ymm12, ymm5, [rcx+64] + vpxor ymm13, ymm6, [rcx+-32] + vpxor ymm14, ymm7, [r9] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Row 2 + vpxor ymm10, ymm6, 
[r8+-96] + vpxor ymm11, ymm7, [rcx+-96] + vpxor ymm12, ymm8, [r9+96] + vpxor ymm13, ymm9, [r8+96] + vpxor ymm14, ymm5, [r8] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+32] + vpxor ymm11, ymm5, [r9+-64] + vpxor ymm12, ymm6, [r8+-64] + vpxor ymm13, ymm7, [rcx+96] + vpxor ymm14, ymm8, [rcx] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm0 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+32] + vpxor 
ymm11, ymm8, [rcx+32] + vpxor ymm12, ymm9, [rcx+-64] + vpxor ymm13, ymm5, [r9+128] + vpxor ymm14, ymm6, [r8+128] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Round 3 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-32] + vpxor ymm14, ymm4, [rcx] + vpxor ymm12, ymm2, [rcx+64] + vpxor ymm13, ymm13, [rcx+96] + vpxor ymm14, ymm14, [rcx+128] + vpxor ymm10, ymm10, [r8+-96] + vpxor ymm12, ymm12, [r8+-64] + vpxor ymm13, ymm13, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm10, ymm10, [r9+-96] + vpxor ymm11, ymm11, [r9+-64] + vpxor ymm12, ymm12, [r9+-32] + vpxor ymm14, ymm14, [r9] + vpxor ymm10, ymm10, [r9+32] + vpxor ymm11, ymm11, [r9+64] + vpxor ymm12, ymm12, [r9+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, 
ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+64] + vpxor ymm12, ymm7, [r9+96] + vpxor ymm13, ymm8, [rcx+96] + vpxor ymm14, ymm9, [r8+128] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+96] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+-32] + vpxor ymm11, ymm9, [r9] + vpxor ymm12, ymm5, [r8+-96] + vpxor ymm13, ymm6, [r9+-64] + vpxor ymm14, ymm7, [rcx+-64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR 
[r9+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+64] + vpxor ymm11, ymm7, [rcx+64] + vpxor ymm12, ymm8, [r8+96] + vpxor ymm13, ymm9, [rcx] + vpxor ymm14, ymm5, [r8+32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+128] + vpxor ymm11, ymm5, [r9+-96] + vpxor ymm12, ymm6, [rcx+-96] + vpxor ymm13, ymm7, [r8+-64] + vpxor ymm14, ymm8, [r9+128] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR 
[r8+-64], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+-32] + vpxor ymm11, ymm8, [rcx+-32] + vpxor ymm12, ymm9, [r8] + vpxor ymm13, ymm5, [r9+32] + vpxor ymm14, ymm6, [rcx+32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Round 4 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm13, ymm3, [rcx] + vpxor ymm11, ymm1, [rcx+64] + vpxor ymm13, ymm13, [rcx+96] + vpxor ymm10, ymm10, [rcx+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm10, ymm10, [r8+-32] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm12, ymm12, [r8+96] + vpxor ymm14, ymm14, [r8+128] + vpxor ymm11, ymm11, [r9+-96] + vpxor ymm13, ymm13, [r9+-64] + vpxor ymm11, ymm11, [r9] + vpxor ymm10, ymm10, [r9+64] + vpxor ymm12, ymm12, [r9+96] + vpxor ymm14, ymm14, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, 
ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9] + vpxor ymm12, ymm7, [r8+96] + vpxor ymm13, ymm8, [r8+-64] + vpxor ymm14, ymm9, [rcx+32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+128] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+96] + vpxor ymm11, ymm9, [rcx+-64] + vpxor ymm12, ymm5, [r9+64] + vpxor ymm13, ymm6, [r9+-96] + vpxor ymm14, ymm7, [r8] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + 
vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+64] + vpxor ymm11, ymm7, [r8+-96] + vpxor ymm12, ymm8, [rcx] + vpxor ymm13, ymm9, [r9+128] + vpxor ymm14, ymm5, [r9+-32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+128] + vpxor ymm11, ymm5, [r8+-32] + vpxor ymm12, ymm6, [rcx+64] + vpxor ymm13, ymm7, [rcx+-96] + vpxor ymm14, ymm8, [r9+32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu 
YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+96] + vpxor ymm11, ymm8, [r9+-64] + vpxor ymm12, ymm9, [r8+32] + vpxor ymm13, ymm5, [rcx+128] + vpxor ymm14, ymm6, [rcx+-32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm0 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Round 5 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm12, ymm2, [rcx] + vpxor ymm14, ymm4, [rcx+32] + vpxor ymm12, ymm12, [rcx+64] + vpxor ymm10, ymm10, [rcx+96] + vpxor ymm11, ymm11, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm11, ymm11, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm10, ymm10, [r8+64] + vpxor ymm12, ymm12, [r8+96] + vpxor ymm10, ymm10, [r8+128] + vpxor ymm13, ymm13, [r9+-96] + vpxor ymm14, ymm14, [r9+-32] + vpxor ymm11, ymm11, [r9] + vpxor ymm14, ymm14, [r9+32] + vpxor ymm12, ymm12, [r9+64] + vpxor ymm13, ymm13, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq 
ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+-64] + vpxor ymm12, ymm7, [rcx] + vpxor ymm13, ymm8, [rcx+-96] + vpxor ymm14, ymm9, [rcx+-32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+160] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+-64] + vpxor ymm11, ymm9, [r8] + vpxor ymm12, ymm5, [r8+64] + vpxor ymm13, ymm6, [r8+-32] + vpxor ymm14, ymm7, [r8+32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, 
ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9] + vpxor ymm11, ymm7, [r9+64] + vpxor ymm12, ymm8, [r9+128] + vpxor ymm13, ymm9, [r9+32] + vpxor ymm14, ymm5, [r9+96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm0 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+32] + vpxor ymm11, ymm5, [rcx+96] + vpxor ymm12, ymm6, [r8+-96] + vpxor ymm13, ymm7, [rcx+64] + vpxor ymm14, ymm8, [rcx+128] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor 
ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+96] + vpxor ymm11, ymm8, [r9+-96] + vpxor ymm12, ymm9, [r9+-32] + vpxor ymm13, ymm5, [r8+128] + vpxor ymm14, ymm6, [r9+-64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Round 6 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm12, ymm2, [rcx] + vpxor ymm10, ymm10, [rcx+32] + vpxor ymm13, ymm13, [rcx+64] + vpxor ymm11, ymm11, [rcx+96] + vpxor ymm14, ymm14, [rcx+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm10, ymm10, [r8+-64] + vpxor ymm13, ymm13, [r8+-32] + vpxor ymm11, ymm11, [r8] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm10, ymm10, [r9] + vpxor ymm13, ymm13, [r9+32] + vpxor ymm11, ymm11, [r9+64] + vpxor ymm14, ymm14, [r9+96] + vpxor ymm12, ymm12, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + 
vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8] + vpxor ymm12, ymm7, [r9+128] + vpxor ymm13, ymm8, [rcx+64] + vpxor ymm14, ymm9, [r9+-64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+192] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+-96] + vpxor ymm11, ymm9, [r8+32] + vpxor ymm12, ymm5, [r9] + vpxor ymm13, ymm6, [rcx+96] + vpxor ymm14, ymm7, [r9+-32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, 
ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+-64] + vpxor ymm11, ymm7, [r8+64] + vpxor ymm12, ymm8, [r9+32] + vpxor ymm13, ymm9, [rcx+128] + vpxor ymm14, ymm5, [r8+96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+-32] + vpxor ymm11, ymm5, [r8+-64] + vpxor ymm12, ymm6, [r9+64] + vpxor ymm13, ymm7, [r8+-96] + vpxor ymm14, ymm8, [r8+128] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, 
ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx] + vpxor ymm11, ymm8, [r8+-32] + vpxor ymm12, ymm9, [r9+96] + vpxor ymm13, ymm5, [rcx+32] + vpxor ymm14, ymm6, [r9+-96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Round 7 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-96] + vpxor ymm10, ymm10, [rcx+-64] + vpxor ymm10, ymm10, [rcx+-32] + vpxor ymm13, ymm3, [rcx+64] + vpxor ymm13, ymm13, [rcx+96] + vpxor ymm13, ymm13, [rcx+128] + vpxor ymm13, ymm13, [r8+-96] + vpxor ymm11, ymm1, [r8+-64] + vpxor ymm11, ymm11, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm14, ymm4, [r8+96] + vpxor ymm14, ymm14, [r8+128] + vpxor ymm14, ymm14, [r9+-64] + vpxor ymm14, ymm14, [r9+-32] + vpxor ymm12, ymm2, [r9] + vpxor ymm12, ymm12, [r9+32] + vpxor ymm12, ymm12, [r9+64] + vpxor ymm12, ymm12, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq 
ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+32] + vpxor ymm12, ymm7, [r9+32] + vpxor ymm13, ymm8, [r8+-96] + vpxor ymm14, ymm9, [r9+-96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+224] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+64] + vpxor ymm11, ymm9, [r9+-32] + vpxor ymm12, ymm5, [rcx+-64] + vpxor ymm13, ymm6, [r8+-64] + vpxor ymm14, ymm7, [r9+96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + 
vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8] + vpxor ymm11, ymm7, [r9] + vpxor ymm12, ymm8, [rcx+128] + vpxor ymm13, ymm9, [r8+128] + vpxor ymm14, ymm5, [rcx] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+-64] + vpxor ymm11, ymm5, [rcx+-96] + vpxor ymm12, ymm6, [r8+64] + vpxor ymm13, ymm7, [r9+64] + vpxor ymm14, ymm8, [rcx+32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, 
ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+128] + vpxor ymm11, ymm8, [rcx+96] + vpxor ymm12, ymm9, [r8+96] + vpxor ymm13, ymm5, [rcx+-32] + vpxor ymm14, ymm6, [r8+-32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Round 8 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-96] + vpxor ymm12, ymm2, [rcx+-64] + vpxor ymm14, ymm4, [rcx] + vpxor ymm14, ymm14, [rcx+32] + vpxor ymm10, ymm10, [rcx+64] + vpxor ymm12, ymm12, [rcx+128] + vpxor ymm13, ymm3, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm10, ymm10, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm13, ymm13, [r8+128] + vpxor ymm14, ymm14, [r9+-96] + vpxor ymm10, ymm10, [r9+-64] + vpxor ymm11, ymm11, [r9+-32] + vpxor ymm11, ymm11, [r9] + vpxor ymm12, ymm12, [r9+32] + vpxor ymm13, ymm13, [r9+64] 
+ vpxor ymm14, ymm14, [r9+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+-32] + vpxor ymm12, ymm7, [rcx+128] + vpxor ymm13, ymm8, [r9+64] + vpxor ymm14, ymm9, [r8+-32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+256] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+-96] + vpxor ymm11, ymm9, [r9+96] + vpxor ymm12, ymm5, [r8] + vpxor ymm13, ymm6, [rcx+-96] + vpxor ymm14, ymm7, [r8+96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + 
vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+32] + vpxor ymm11, ymm7, [rcx+-64] + vpxor ymm12, ymm8, [r8+128] + vpxor ymm13, ymm9, [rcx+32] + vpxor ymm14, ymm5, [r9+128] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+-96] + vpxor ymm11, ymm5, [rcx+64] + vpxor ymm12, ymm6, [r9] + vpxor ymm13, ymm7, [r8+64] + vpxor ymm14, ymm8, [rcx+-32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor 
ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+32] + vpxor ymm11, ymm8, [r8+-64] + vpxor ymm12, ymm9, [rcx] + vpxor ymm13, ymm5, [r9+-64] + vpxor ymm14, ymm6, [rcx+96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Round 9 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm13, ymm13, [rcx+32] + vpxor ymm11, ymm11, [rcx+64] + vpxor ymm12, ymm2, [rcx+128] + vpxor ymm10, ymm10, [r8+-96] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm12, ymm12, [r8] + vpxor ymm10, ymm10, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm14, ymm14, [r8+96] + vpxor ymm12, ymm12, [r8+128] + vpxor ymm10, ymm10, [r9+-96] + vpxor ymm11, ymm11, [r9+-32] 
+ vpxor ymm12, ymm12, [r9] + vpxor ymm13, ymm13, [r9+64] + vpxor ymm11, ymm11, [r9+96] + vpxor ymm14, ymm14, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+96] + vpxor ymm12, ymm7, [r8+128] + vpxor ymm13, ymm8, [r8+64] + vpxor ymm14, ymm9, [rcx+96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+288] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+64] + vpxor ymm11, ymm9, [r8+96] + vpxor ymm12, ymm5, [r8+32] + vpxor ymm13, ymm6, [rcx+64] + vpxor ymm14, ymm7, [rcx] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 
61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+-32] + vpxor ymm11, ymm7, [r8] + vpxor ymm12, ymm8, [rcx+32] + vpxor ymm13, ymm9, [rcx+-32] + vpxor ymm14, ymm5, [r9+32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+-32] + vpxor ymm11, ymm5, [r8+-96] + vpxor ymm12, ymm6, [rcx+-64] + vpxor ymm13, ymm7, [r9] + vpxor ymm14, ymm8, [r9+-64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, 
ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+128] + vpxor ymm11, ymm8, [rcx+-96] + vpxor ymm12, ymm9, [r9+128] + vpxor ymm13, ymm5, [r9+-96] + vpxor ymm14, ymm6, [r8+-64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Round 10 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-64] + vpxor ymm13, ymm3, [rcx+-32] + vpxor ymm14, ymm4, [rcx] + vpxor ymm12, ymm12, [rcx+32] + vpxor ymm13, ymm13, [rcx+64] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm11, ymm1, [r8+-96] + vpxor ymm10, ymm10, [r8+-32] + vpxor ymm11, ymm11, [r8] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm11, ymm11, [r8+96] + vpxor 
ymm12, ymm12, [r8+128] + vpxor ymm14, ymm14, [r9+-64] + vpxor ymm10, ymm10, [r9+-32] + vpxor ymm13, ymm13, [r9] + vpxor ymm14, ymm14, [r9+32] + vpxor ymm10, ymm10, [r9+64] + vpxor ymm11, ymm11, [r9+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+96] + vpxor ymm12, ymm7, [rcx+32] + vpxor ymm13, ymm8, [r9] + vpxor ymm14, ymm9, [r8+-64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+320] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+64] + vpxor ymm11, ymm9, [rcx] + vpxor ymm12, ymm5, [r9+-32] + vpxor ymm13, ymm6, [r8+-96] + vpxor ymm14, ymm7, [r9+128] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, 
ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+96] + vpxor ymm11, ymm7, [r8+32] + vpxor ymm12, ymm8, [rcx+-32] + vpxor ymm13, ymm9, [r9+-64] + vpxor ymm14, ymm5, [rcx+128] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+96] + vpxor ymm11, ymm5, [r9+64] + vpxor ymm12, ymm6, [r8] + vpxor ymm13, ymm7, [rcx+-64] + vpxor ymm14, ymm8, [r9+-96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 
36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+128] + vpxor ymm11, ymm8, [rcx+64] + vpxor ymm12, ymm9, [r9+32] + vpxor ymm13, ymm5, [r8+-32] + vpxor ymm14, ymm6, [rcx+-96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Round 11 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm11, ymm1, [rcx] + vpxor ymm12, ymm12, [rcx+32] + vpxor ymm10, ymm10, [rcx+96] + vpxor ymm14, ymm4, [rcx+128] + vpxor ymm13, ymm13, [r8+-96] + vpxor ymm14, ymm14, [r8+-64] + vpxor ymm12, ymm12, [r8] + vpxor 
ymm11, ymm11, [r8+32] + vpxor ymm10, ymm10, [r8+64] + vpxor ymm11, ymm11, [r8+96] + vpxor ymm14, ymm14, [r9+-96] + vpxor ymm13, ymm13, [r9+-64] + vpxor ymm12, ymm12, [r9+-32] + vpxor ymm13, ymm13, [r9] + vpxor ymm11, ymm11, [r9+64] + vpxor ymm10, ymm10, [r9+96] + vpxor ymm14, ymm14, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx] + vpxor ymm12, ymm7, [rcx+-32] + vpxor ymm13, ymm8, [rcx+-64] + vpxor ymm14, ymm9, [rcx+-96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+352] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9] + vpxor ymm11, ymm9, [r9+128] + vpxor ymm12, ymm5, [r9+96] + vpxor ymm13, ymm6, [r9+64] + vpxor ymm14, ymm7, [r9+32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 
61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm0 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+96] + vpxor ymm11, ymm7, [r9+-32] + vpxor ymm12, ymm8, [r9+-64] + vpxor ymm13, ymm9, [r9+-96] + vpxor ymm14, ymm5, [r8+128] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+-64] + vpxor ymm11, ymm5, [r8+64] + vpxor ymm12, ymm6, [r8+32] + vpxor ymm13, ymm7, [r8] + vpxor ymm14, ymm8, [r8+-32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq 
ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+32] + vpxor ymm11, ymm8, [r8+-96] + vpxor ymm12, ymm9, [rcx+128] + vpxor ymm13, ymm5, [rcx+96] + vpxor ymm14, ymm6, [rcx+64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Round 12 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm11, ymm1, [rcx] + vpxor ymm10, ymm10, [r8+-64] + vpxor ymm14, ymm14, [r8+-32] + vpxor 
ymm13, ymm13, [r8] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm10, ymm10, [r8+96] + vpxor ymm14, ymm14, [r8+128] + vpxor ymm13, ymm13, [r9+-96] + vpxor ymm12, ymm12, [r9+-64] + vpxor ymm11, ymm11, [r9+-32] + vpxor ymm10, ymm10, [r9] + vpxor ymm14, ymm14, [r9+32] + vpxor ymm13, ymm13, [r9+64] + vpxor ymm12, ymm12, [r9+96] + vpxor ymm11, ymm11, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+128] + vpxor ymm12, ymm7, [r9+-64] + vpxor ymm13, ymm8, [r8] + vpxor ymm14, ymm9, [rcx+64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+384] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+-64] + vpxor ymm11, ymm9, [r9+32] + vpxor ymm12, ymm5, [r8+96] + vpxor ymm13, ymm6, [r8+64] + vpxor ymm14, 
ymm7, [rcx+128] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx] + vpxor ymm11, ymm7, [r9+96] + vpxor ymm12, ymm8, [r9+-96] + vpxor ymm13, ymm9, [r8+-32] + vpxor ymm14, ymm5, [rcx+32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+-96] + vpxor ymm11, ymm5, [r9] + vpxor ymm12, ymm6, [r9+-32] + vpxor ymm13, ymm7, [r8+32] + vpxor ymm14, ymm8, 
[rcx+96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+-32] + vpxor ymm11, ymm8, [r9+64] + vpxor ymm12, ymm9, [r8+128] + vpxor ymm13, ymm5, [r8+-64] + vpxor ymm14, ymm6, [r8+-96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Round 13 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-96] + vpxor ymm10, ymm10, [rcx+-64] + vpxor ymm10, ymm10, [rcx] + vpxor 
ymm14, ymm4, [rcx+32] + vpxor ymm14, ymm14, [rcx+64] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm14, ymm14, [rcx+128] + vpxor ymm13, ymm3, [r8+-32] + vpxor ymm13, ymm13, [r8] + vpxor ymm13, ymm13, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm12, ymm2, [r8+96] + vpxor ymm12, ymm12, [r9+-96] + vpxor ymm12, ymm12, [r9+-64] + vpxor ymm12, ymm12, [r9+-32] + vpxor ymm11, ymm1, [r9] + vpxor ymm11, ymm11, [r9+32] + vpxor ymm11, ymm11, [r9+96] + vpxor ymm11, ymm11, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+32] + vpxor ymm12, ymm7, [r9+-96] + vpxor ymm13, ymm8, [r8+32] + vpxor ymm14, ymm9, [r8+-96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+416] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8] + vpxor 
ymm11, ymm9, [rcx+128] + vpxor ymm12, ymm5, [rcx] + vpxor ymm13, ymm6, [r9] + vpxor ymm14, ymm7, [r8+128] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+128] + vpxor ymm11, ymm7, [r8+96] + vpxor ymm12, ymm8, [r8+-32] + vpxor ymm13, ymm9, [rcx+96] + vpxor ymm14, ymm5, [rcx+-32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+64] + vpxor ymm11, ymm5, 
[rcx+-64] + vpxor ymm12, ymm6, [r9+96] + vpxor ymm13, ymm7, [r9+-32] + vpxor ymm14, ymm8, [r8+-64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+-64] + vpxor ymm11, ymm8, [r8+64] + vpxor ymm12, ymm9, [rcx+32] + vpxor ymm13, ymm5, [rcx+-96] + vpxor ymm14, ymm6, [r9+64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Round 14 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor 
ymm11, ymm1, [rcx+-64] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm12, ymm2, [rcx] + vpxor ymm10, ymm10, [rcx+64] + vpxor ymm13, ymm3, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm14, ymm14, [r8+-96] + vpxor ymm14, ymm14, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm10, ymm10, [r8] + vpxor ymm13, ymm13, [r8+32] + vpxor ymm11, ymm11, [r8+96] + vpxor ymm14, ymm14, [r8+128] + vpxor ymm12, ymm12, [r9+-96] + vpxor ymm13, ymm13, [r9+-32] + vpxor ymm13, ymm13, [r9] + vpxor ymm11, ymm11, [r9+32] + vpxor ymm12, ymm12, [r9+96] + vpxor ymm10, ymm10, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+128] + vpxor ymm12, ymm7, [r8+-32] + vpxor ymm13, ymm8, [r9+-32] + vpxor ymm14, ymm9, [r9+64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+448] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR 
[r9+-32], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+32] + vpxor ymm11, ymm9, [r8+128] + vpxor ymm12, ymm5, [r9+128] + vpxor ymm13, ymm6, [rcx+-64] + vpxor ymm14, ymm7, [rcx+32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+32] + vpxor ymm11, ymm7, [rcx] + vpxor ymm12, ymm8, [rcx+96] + vpxor ymm13, ymm9, [r8+-64] + vpxor ymm14, ymm5, [r9+-64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r8+-64], 
ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+-96] + vpxor ymm11, ymm5, [r8] + vpxor ymm12, ymm6, [r8+96] + vpxor ymm13, ymm7, [r9+96] + vpxor ymm14, ymm8, [rcx+-96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+-96] + vpxor ymm11, ymm8, [r9] + vpxor ymm12, ymm9, [rcx+-32] + vpxor ymm13, ymm5, [rcx+64] + vpxor ymm14, ymm6, [r8+64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm0 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu 
YMMWORD PTR [r8+64], ymm4 + ; Round 15 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm11, ymm1, [rcx] + vpxor ymm14, ymm14, [rcx+32] + vpxor ymm12, ymm2, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm10, ymm10, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm11, ymm11, [r8] + vpxor ymm10, ymm10, [r8+32] + vpxor ymm12, ymm12, [r8+96] + vpxor ymm11, ymm11, [r8+128] + vpxor ymm14, ymm14, [r9+-64] + vpxor ymm13, ymm13, [r9+-32] + vpxor ymm10, ymm10, [r9+32] + vpxor ymm14, ymm14, [r9+64] + vpxor ymm13, ymm13, [r9+96] + vpxor ymm12, ymm12, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+128] + vpxor ymm12, ymm7, [rcx+96] + vpxor ymm13, ymm8, [r9+96] + vpxor ymm14, ymm9, [r8+64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+480] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + 
vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+-32] + vpxor ymm11, ymm9, [rcx+32] + vpxor ymm12, ymm5, [r9+32] + vpxor ymm13, ymm6, [r8] + vpxor ymm14, ymm7, [rcx+-32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+128] + vpxor ymm11, ymm7, [r9+128] + vpxor ymm12, ymm8, [r8+-64] + vpxor ymm13, ymm9, [rcx+-96] + vpxor ymm14, ymm5, [r9+-96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu 
YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+64] + vpxor ymm11, ymm5, [r8+32] + vpxor ymm12, ymm6, [rcx] + vpxor ymm13, ymm7, [r8+96] + vpxor ymm14, ymm8, [rcx+64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+-32] + vpxor ymm11, ymm8, [rcx+-64] + vpxor ymm12, ymm9, [r9+-64] + vpxor ymm13, ymm5, [r8+-96] + vpxor ymm14, ymm6, [r9] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR 
[rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Round 16 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm12, ymm2, [rcx] + vpxor ymm11, ymm1, [rcx+32] + vpxor ymm14, ymm14, [rcx+64] + vpxor ymm12, ymm12, [rcx+96] + vpxor ymm10, ymm10, [rcx+128] + vpxor ymm12, ymm12, [r8+-64] + vpxor ymm13, ymm13, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm11, ymm11, [r8+128] + vpxor ymm14, ymm14, [r9+-96] + vpxor ymm10, ymm10, [r9+-32] + vpxor ymm12, ymm12, [r9+32] + vpxor ymm10, ymm10, [r9+64] + vpxor ymm13, ymm13, [r9+96] + vpxor ymm11, ymm11, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+32] + vpxor ymm12, ymm7, [r8+-64] + vpxor ymm13, ymm8, [r8+96] + vpxor ymm14, ymm9, [r9] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant 
+ vpxor ymm15, ymm15, [rax+512] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+96] + vpxor ymm11, ymm9, [rcx+-32] + vpxor ymm12, ymm5, [rcx+128] + vpxor ymm13, ymm6, [r8+32] + vpxor ymm14, ymm7, [r9+-64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+128] + vpxor ymm11, ymm7, [r9+32] + vpxor ymm12, ymm8, [rcx+-96] + vpxor ymm13, ymm9, [rcx+64] + vpxor ymm14, ymm5, [r8+-32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor 
ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+64] + vpxor ymm11, ymm5, [r9+-32] + vpxor ymm12, ymm6, [r9+128] + vpxor ymm13, ymm7, [rcx] + vpxor ymm14, ymm8, [r8+-96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+96] + vpxor ymm11, ymm8, [r8] + vpxor ymm12, ymm9, [r9+-96] + vpxor ymm13, ymm5, [r9+64] + vpxor ymm14, ymm6, [rcx+-64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, 
ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Round 17 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx] + vpxor ymm11, ymm11, [rcx+32] + vpxor ymm13, ymm13, [rcx+64] + vpxor ymm12, ymm12, [rcx+128] + vpxor ymm14, ymm4, [r8+-96] + vpxor ymm12, ymm12, [r8+-64] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm13, ymm13, [r8+32] + vpxor ymm10, ymm10, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm10, ymm10, [r8+128] + vpxor ymm14, ymm14, [r9+-64] + vpxor ymm11, ymm11, [r9+-32] + vpxor ymm14, ymm14, [r9] + vpxor ymm11, ymm11, [r9+32] + vpxor ymm10, ymm10, [r9+96] + vpxor ymm12, ymm12, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+-32] + vpxor ymm12, ymm7, [rcx+-96] + vpxor ymm13, ymm8, [rcx] + vpxor ymm14, ymm9, [rcx+-64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + 
vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+544] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+96] + vpxor ymm11, ymm9, [r9+-64] + vpxor ymm12, ymm5, [r8+128] + vpxor ymm13, ymm6, [r9+-32] + vpxor ymm14, ymm7, [r9+-96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+32] + vpxor ymm11, ymm7, [rcx+128] + vpxor ymm12, ymm8, [rcx+64] + vpxor ymm13, ymm9, [r8+-96] + vpxor ymm14, ymm5, [rcx+96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, 
ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9] + vpxor ymm11, ymm5, [r9+96] + vpxor ymm12, ymm6, [r9+32] + vpxor ymm13, ymm7, [r9+128] + vpxor ymm14, ymm8, [r9+64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm0 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+-64] + vpxor ymm11, ymm8, [r8+32] + vpxor ymm12, ymm9, [r8+-32] + vpxor ymm13, ymm5, [r8+64] + vpxor ymm14, ymm6, [r8] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor 
ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Round 18 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx] + vpxor ymm10, ymm10, [rcx+32] + vpxor ymm12, ymm12, [rcx+64] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm13, ymm13, [r8+-96] + vpxor ymm10, ymm10, [r8+96] + vpxor ymm12, ymm12, [r8+128] + vpxor ymm14, ymm14, [r9+-96] + vpxor ymm11, ymm11, [r9+-64] + vpxor ymm13, ymm13, [r9+-32] + vpxor ymm10, ymm10, [r9] + vpxor ymm12, ymm12, [r9+32] + vpxor ymm14, ymm14, [r9+64] + vpxor ymm11, ymm11, [r9+96] + vpxor ymm13, ymm13, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+-64] + vpxor ymm12, ymm7, [rcx+64] + vpxor ymm13, ymm8, [r9+128] + vpxor ymm14, ymm9, [r8] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + 
vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+576] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx] + vpxor ymm11, ymm9, [r9+-96] + vpxor ymm12, ymm5, [rcx+32] + vpxor ymm13, ymm6, [r9+96] + vpxor ymm14, ymm7, [r8+-32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+-32] + vpxor ymm11, ymm7, [r8+128] + vpxor ymm12, ymm8, [r8+-96] + vpxor ymm13, ymm9, [r9+64] + vpxor ymm14, ymm5, [r8+-64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, 
ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+-64] + vpxor ymm11, ymm5, [r8+96] + vpxor ymm12, ymm6, [rcx+128] + vpxor ymm13, ymm7, [r9+32] + vpxor ymm14, ymm8, [r8+64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+-96] + vpxor ymm11, ymm8, [r9+-32] + vpxor ymm12, ymm9, [rcx+96] + vpxor ymm13, ymm5, [r9] + vpxor ymm14, ymm6, [r8+32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + 
vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Round 19 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-64] + vpxor ymm10, ymm10, [rcx+-32] + vpxor ymm10, ymm10, [rcx] + vpxor ymm12, ymm2, [rcx+32] + vpxor ymm12, ymm12, [rcx+64] + vpxor ymm12, ymm12, [rcx+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm14, ymm4, [r8+-64] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm11, ymm1, [r8+96] + vpxor ymm11, ymm11, [r8+128] + vpxor ymm11, ymm11, [r9+-96] + vpxor ymm11, ymm11, [r9+-64] + vpxor ymm13, ymm3, [r9+32] + vpxor ymm13, ymm13, [r9+64] + vpxor ymm13, ymm13, [r9+96] + vpxor ymm13, ymm13, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+-96] + vpxor ymm12, ymm7, [r8+-96] + vpxor ymm13, ymm8, [r9+32] + vpxor ymm14, ymm9, [r8+32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, 
ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+608] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+128] + vpxor ymm11, ymm9, [r8+-32] + vpxor ymm12, ymm5, [rcx+-32] + vpxor ymm13, ymm6, [r8+96] + vpxor ymm14, ymm7, [rcx+96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+-64] + vpxor ymm11, ymm7, [rcx+32] + vpxor ymm12, ymm8, [r9+64] + vpxor ymm13, ymm9, [r8+64] + vpxor ymm14, ymm5, [rcx+-96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 
+ vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8] + vpxor ymm11, ymm5, [rcx] + vpxor ymm12, ymm6, [r8+128] + vpxor ymm13, ymm7, [rcx+128] + vpxor ymm14, ymm8, [r9] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+64] + vpxor ymm11, ymm8, [r9+96] + vpxor ymm12, ymm9, [r8+-64] + vpxor ymm13, ymm5, [rcx+-64] + vpxor ymm14, ymm6, [r9+-32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, 
ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Round 20 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm11, ymm1, [rcx] + vpxor ymm11, ymm11, [rcx+32] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm13, ymm3, [rcx+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm11, ymm11, [r8+-32] + vpxor ymm10, ymm10, [r8] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm12, ymm12, [r8+128] + vpxor ymm11, ymm11, [r9+-96] + vpxor ymm10, ymm10, [r9+-64] + vpxor ymm14, ymm14, [r9] + vpxor ymm13, ymm13, [r9+32] + vpxor ymm12, ymm12, [r9+64] + vpxor ymm10, ymm10, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+-32] + vpxor ymm12, ymm7, [r9+64] + vpxor ymm13, ymm8, [rcx+128] + vpxor ymm14, ymm9, [r9+-32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + 
vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+640] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+32] + vpxor ymm11, ymm9, [rcx+96] + vpxor ymm12, ymm5, [r9+-64] + vpxor ymm13, ymm6, [rcx] + vpxor ymm14, ymm7, [r8+-64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+-96] + vpxor ymm11, ymm7, [rcx+-32] + vpxor ymm12, ymm8, [r8+64] + vpxor ymm13, ymm9, [r9] + vpxor ymm14, ymm5, [rcx+64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 
18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+32] + vpxor ymm11, ymm5, [r9+128] + vpxor ymm12, ymm6, [rcx+32] + vpxor ymm13, ymm7, [r8+128] + vpxor ymm14, ymm8, [rcx+-64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+-96] + vpxor ymm11, ymm8, [r8+96] + vpxor ymm12, ymm9, [rcx+-96] + vpxor ymm13, ymm5, [r8] + vpxor ymm14, ymm6, [r9+96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor 
ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Round 21 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx] + vpxor ymm12, ymm2, [rcx+32] + vpxor ymm14, ymm14, [rcx+64] + vpxor ymm11, ymm11, [rcx+96] + vpxor ymm13, ymm13, [rcx+128] + vpxor ymm14, ymm14, [r8+-64] + vpxor ymm11, ymm11, [r8+-32] + vpxor ymm10, ymm10, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm13, ymm13, [r8+128] + vpxor ymm10, ymm10, [r9+-96] + vpxor ymm12, ymm12, [r9+-64] + vpxor ymm14, ymm14, [r9+-32] + vpxor ymm13, ymm13, [r9] + vpxor ymm10, ymm10, [r9+32] + vpxor ymm12, ymm12, [r9+64] + vpxor ymm11, ymm11, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+96] + vpxor ymm12, ymm7, [r8+64] + vpxor ymm13, ymm8, [r8+128] + vpxor ymm14, ymm9, [r9+96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, 
ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+672] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+128] + vpxor ymm11, ymm9, [r8+-64] + vpxor ymm12, ymm5, [r9+-96] + vpxor ymm13, ymm6, [r9+128] + vpxor ymm14, ymm7, [rcx+-96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+-32] + vpxor ymm11, ymm7, [r9+-64] + vpxor ymm12, ymm8, [r9] + vpxor ymm13, ymm9, [rcx+-64] + vpxor ymm14, ymm5, [r8+-96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, 
ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+-32] + vpxor ymm11, ymm5, [r9+32] + vpxor ymm12, ymm6, [rcx+-32] + vpxor ymm13, ymm7, [rcx+32] + vpxor ymm14, ymm8, [r8] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm0 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+64] + vpxor ymm11, ymm8, [rcx] + vpxor ymm12, ymm9, [rcx+64] + vpxor ymm13, ymm5, [r8+32] + vpxor ymm14, ymm6, [r8+96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, 
ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Round 22 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm13, ymm13, [rcx+32] + vpxor ymm11, ymm1, [rcx+96] + vpxor ymm10, ymm10, [rcx+128] + vpxor ymm14, ymm14, [r8+-96] + vpxor ymm11, ymm11, [r8+-64] + vpxor ymm10, ymm10, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm13, ymm13, [r8+128] + vpxor ymm12, ymm12, [r9+-96] + vpxor ymm11, ymm11, [r9+-64] + vpxor ymm10, ymm10, [r9+-32] + vpxor ymm12, ymm12, [r9] + vpxor ymm11, ymm11, [r9+32] + vpxor ymm14, ymm14, [r9+96] + vpxor ymm13, ymm13, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+-64] + vpxor ymm12, ymm7, [r9] + vpxor ymm13, ymm8, [rcx+32] + vpxor ymm14, ymm9, [r8+96] + 
vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+704] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+128] + vpxor ymm11, ymm9, [rcx+-96] + vpxor ymm12, ymm5, [r8+-32] + vpxor ymm13, ymm6, [r9+32] + vpxor ymm14, ymm7, [rcx+64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+96] + vpxor ymm11, ymm7, [r9+-96] + vpxor ymm12, ymm8, [rcx+-64] + vpxor ymm13, ymm9, [r8] + vpxor ymm14, ymm5, [r9+64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 
39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+96] + vpxor ymm11, ymm5, [rcx+128] + vpxor ymm12, ymm6, [r9+-64] + vpxor ymm13, ymm7, [rcx+-32] + vpxor ymm14, ymm8, [r8+32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+64] + vpxor ymm11, ymm8, [r9+128] + vpxor ymm12, ymm9, [r8+-96] + vpxor ymm13, ymm5, [r9+-32] + vpxor ymm14, ymm6, [rcx] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + 
vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Round 23 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-96] + vpxor ymm12, ymm2, [rcx+-64] + vpxor ymm13, ymm3, [rcx+-32] + vpxor ymm13, ymm13, [rcx+32] + vpxor ymm14, ymm4, [rcx+64] + vpxor ymm10, ymm10, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm11, ymm11, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm13, ymm13, [r8] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm14, ymm14, [r8+96] + vpxor ymm10, ymm10, [r8+128] + vpxor ymm11, ymm11, [r9+-96] + vpxor ymm12, ymm12, [r9+-64] + vpxor ymm12, ymm12, [r9] + vpxor ymm13, ymm13, [r9+32] + vpxor ymm14, ymm14, [r9+64] + vpxor ymm10, ymm10, [r9+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, 
[rcx+-96] + vpxor ymm12, ymm7, [rcx+-64] + vpxor ymm13, ymm8, [rcx+-32] + vpxor ymm14, ymm9, [rcx] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+736] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+32] + vpxor ymm11, ymm9, [rcx+64] + vpxor ymm12, ymm5, [rcx+96] + vpxor ymm13, ymm6, [rcx+128] + vpxor ymm14, ymm7, [r8+-96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+-64] + vpxor ymm11, ymm7, [r8+-32] + vpxor ymm12, ymm8, [r8] + vpxor ymm13, ymm9, 
[r8+32] + vpxor ymm14, ymm5, [r8+64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+96] + vpxor ymm11, ymm5, [r8+128] + vpxor ymm12, ymm6, [r9+-96] + vpxor ymm13, ymm7, [r9+-64] + vpxor ymm14, ymm8, [r9+-32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9] + vpxor ymm11, ymm8, [r9+32] + vpxor ymm12, ymm9, [r9+64] + vpxor ymm13, ymm5, [r9+96] + vpxor 
ymm14, ymm6, [r9+128] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm0 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + sub rcx, 128 + vmovdqu YMMWORD PTR [rcx], ymm15 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +sha3_128_blocksx4_seed_avx2 ENDP +_TEXT ENDS +ENDIF +IFDEF WOLFSSL_HAVE_MLKEM +_DATA SEGMENT +ALIGN 16 +L_sha3_256_blockx4_seed_avx2_end_mark QWORD 8000000000000000h, 8000000000000000h + QWORD 8000000000000000h, 8000000000000000h +ptr_L_sha3_256_blockx4_seed_avx2_end_mark QWORD L_sha3_256_blockx4_seed_avx2_end_mark +_DATA ENDS +_TEXT SEGMENT READONLY PARA +sha3_256_blocksx4_seed_avx2 PROC + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD 
PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + mov rax, QWORD PTR [ptr_L_sha3_x4_avx2_r] + mov r8, rcx + mov r9, rcx + vpbroadcastq ymm15, QWORD PTR [rdx] + add rcx, 128 + vpbroadcastq ymm11, QWORD PTR [rdx+8] + add r8, 384 + vpbroadcastq ymm12, QWORD PTR [rdx+16] + add r9, 640 + vpbroadcastq ymm13, QWORD PTR [rdx+24] + vmovdqu ymm5, YMMWORD PTR L_sha3_256_blockx4_seed_avx2_end_mark + vpxor ymm6, ymm6, ymm6 + vmovdqu YMMWORD PTR [rcx+-96], ymm11 + vmovdqu YMMWORD PTR [rcx+-64], ymm12 + vmovdqu YMMWORD PTR [rcx+-32], ymm13 + vmovdqu ymm14, YMMWORD PTR [rcx] + vmovdqu YMMWORD PTR [rcx+32], ymm6 + vmovdqu YMMWORD PTR [rcx+64], ymm6 + vmovdqu YMMWORD PTR [rcx+96], ymm6 + vmovdqu YMMWORD PTR [rcx+128], ymm6 + vmovdqu YMMWORD PTR [r8+-96], ymm6 + vmovdqu YMMWORD PTR [r8+-64], ymm6 + vmovdqu YMMWORD PTR [r8+-32], ymm6 + vmovdqu YMMWORD PTR [r8], ymm6 + vmovdqu YMMWORD PTR [r8+32], ymm6 + vmovdqu YMMWORD PTR [r8+64], ymm6 + vmovdqu YMMWORD PTR [r8+96], ymm6 + vmovdqu YMMWORD PTR [r8+128], ymm5 + vmovdqu YMMWORD PTR [r9+-96], ymm6 + vmovdqu YMMWORD PTR [r9+-64], ymm6 + vmovdqu YMMWORD PTR [r9+-32], ymm6 + vmovdqu YMMWORD PTR [r9], ymm6 + vmovdqu YMMWORD PTR [r9+32], ymm6 + vmovdqu YMMWORD PTR [r9+64], ymm6 + vmovdqu YMMWORD PTR [r9+96], ymm6 + vmovdqu YMMWORD PTR [r9+128], ymm6 + vmovdqu ymm10, ymm15 + vpxor ymm11, ymm11, ymm5 + ; Round 0 + ; Calc b[0..4] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+64] + 
vpxor ymm12, ymm7, [r8] + vpxor ymm13, ymm8, [r9+-64] + vpxor ymm14, ymm9, [r9+128] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+-32] + vpxor ymm11, ymm9, [r8+-96] + vpxor ymm12, ymm5, [r8+-64] + vpxor ymm13, ymm6, [r8+128] + vpxor ymm14, ymm7, [r9+64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+-96] + vpxor ymm11, ymm7, [rcx+96] + vpxor ymm12, ymm8, [r8+32] + vpxor ymm13, ymm9, [r9+-32] + vpxor ymm14, 
ymm5, [r9] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx] + vpxor ymm11, ymm5, [rcx+32] + vpxor ymm12, ymm6, [r8+-32] + vpxor ymm13, ymm7, [r9+-96] + vpxor ymm14, ymm8, [r9+96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+-64] + vpxor ymm11, ymm8, [rcx+128] + vpxor ymm12, ymm9, [r8+64] + vpxor ymm13, ymm5, [r8+96] + vpxor ymm14, ymm6, [r9+32] + 
vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Round 1 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-96] + vpxor ymm10, ymm10, [rcx+-32] + vpxor ymm10, ymm10, [rcx] + vpxor ymm11, ymm1, [rcx+32] + vpxor ymm11, ymm11, [rcx+64] + vpxor ymm11, ymm11, [rcx+96] + vpxor ymm11, ymm11, [r8+-96] + vpxor ymm12, ymm2, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm12, ymm12, [r8] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm13, ymm3, [r8+128] + vpxor ymm13, ymm13, [r9+-96] + vpxor ymm13, ymm13, [r9+-64] + vpxor ymm13, ymm13, [r9+-32] + vpxor ymm14, ymm4, [r9] + vpxor ymm14, ymm14, [r9+64] + vpxor ymm14, ymm14, [r9+96] + vpxor ymm14, ymm14, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; 
Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+-96] + vpxor ymm12, ymm7, [r8+32] + vpxor ymm13, ymm8, [r9+-96] + vpxor ymm14, ymm9, [r9+32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+32] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+-64] + vpxor ymm11, ymm9, [r9+64] + vpxor ymm12, ymm5, [rcx+-96] + vpxor ymm13, ymm6, [rcx+32] + vpxor ymm14, ymm7, [r8+64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm0 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+64] + vpxor ymm11, ymm7, 
[r8+-64] + vpxor ymm12, ymm8, [r9+-32] + vpxor ymm13, ymm9, [r9+96] + vpxor ymm14, ymm5, [rcx+-64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+128] + vpxor ymm11, ymm5, [rcx+-32] + vpxor ymm12, ymm6, [rcx+96] + vpxor ymm13, ymm7, [r8+-32] + vpxor ymm14, ymm8, [r8+96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8] + vpxor ymm11, ymm8, 
[r8+128] + vpxor ymm12, ymm9, [r9] + vpxor ymm13, ymm5, [rcx] + vpxor ymm14, ymm6, [rcx+128] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Round 2 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx+32] + vpxor ymm10, ymm10, [rcx+64] + vpxor ymm12, ymm12, [rcx+96] + vpxor ymm11, ymm11, [r8+-96] + vpxor ymm11, ymm11, [r8+-64] + vpxor ymm13, ymm13, [r8+-32] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm14, ymm14, [r8+96] + vpxor ymm13, ymm13, [r9+-96] + vpxor ymm10, ymm10, [r9+-64] + vpxor ymm12, ymm12, [r9+-32] + vpxor ymm14, ymm14, [r9+32] + vpxor ymm11, ymm11, [r9+64] + vpxor ymm13, ymm13, [r9+96] + vpxor ymm10, ymm10, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, 
ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+64] + vpxor ymm12, ymm7, [r9+-32] + vpxor ymm13, ymm8, [r8+-32] + vpxor ymm14, ymm9, [rcx+128] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+64] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+-96] + vpxor ymm11, ymm9, [r8+64] + vpxor ymm12, ymm5, [rcx+64] + vpxor ymm13, ymm6, [rcx+-32] + vpxor ymm14, ymm7, [r9] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + 
vmovdqu YMMWORD PTR [r9], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+-96] + vpxor ymm11, ymm7, [rcx+-96] + vpxor ymm12, ymm8, [r9+96] + vpxor ymm13, ymm9, [r8+96] + vpxor ymm14, ymm5, [r8] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+32] + vpxor ymm11, ymm5, [r9+-64] + vpxor ymm12, ymm6, [r8+-64] + vpxor ymm13, ymm7, [rcx+96] + vpxor ymm14, ymm8, [rcx] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm0 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD 
PTR [rcx], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+32] + vpxor ymm11, ymm8, [rcx+32] + vpxor ymm12, ymm9, [rcx+-64] + vpxor ymm13, ymm5, [r9+128] + vpxor ymm14, ymm6, [r8+128] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Round 3 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-32] + vpxor ymm14, ymm4, [rcx] + vpxor ymm12, ymm2, [rcx+64] + vpxor ymm13, ymm13, [rcx+96] + vpxor ymm14, ymm14, [rcx+128] + vpxor ymm10, ymm10, [r8+-96] + vpxor ymm12, ymm12, [r8+-64] + vpxor ymm13, ymm13, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm10, ymm10, [r9+-96] + vpxor ymm11, ymm11, [r9+-64] + vpxor ymm12, ymm12, [r9+-32] + vpxor ymm14, ymm14, [r9] + vpxor ymm10, ymm10, [r9+32] + vpxor ymm11, ymm11, [r9+64] + vpxor ymm12, ymm12, [r9+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + 
vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+64] + vpxor ymm12, ymm7, [r9+96] + vpxor ymm13, ymm8, [rcx+96] + vpxor ymm14, ymm9, [r8+128] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+96] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+-32] + vpxor ymm11, ymm9, [r9] + vpxor ymm12, ymm5, [r8+-96] + vpxor ymm13, ymm6, [r9+-64] + vpxor ymm14, ymm7, [rcx+-64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [r9], 
ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+64] + vpxor ymm11, ymm7, [rcx+64] + vpxor ymm12, ymm8, [r8+96] + vpxor ymm13, ymm9, [rcx] + vpxor ymm14, ymm5, [r8+32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+128] + vpxor ymm11, ymm5, [r9+-96] + vpxor ymm12, ymm6, [rcx+-96] + vpxor ymm13, ymm7, [r8+-64] + vpxor ymm14, ymm8, [r9+128] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + 
vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+-32] + vpxor ymm11, ymm8, [rcx+-32] + vpxor ymm12, ymm9, [r8] + vpxor ymm13, ymm5, [r9+32] + vpxor ymm14, ymm6, [rcx+32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Round 4 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm13, ymm3, [rcx] + vpxor ymm11, ymm1, [rcx+64] + vpxor ymm13, ymm13, [rcx+96] + vpxor ymm10, ymm10, [rcx+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm10, ymm10, [r8+-32] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm12, ymm12, [r8+96] + vpxor ymm14, ymm14, [r8+128] + vpxor ymm11, ymm11, [r9+-96] + vpxor ymm13, ymm13, [r9+-64] + vpxor ymm11, ymm11, [r9] + vpxor ymm10, ymm10, [r9+64] + vpxor ymm12, ymm12, [r9+96] + vpxor ymm14, ymm14, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, 
ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9] + vpxor ymm12, ymm7, [r8+96] + vpxor ymm13, ymm8, [r8+-64] + vpxor ymm14, ymm9, [rcx+32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+128] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+96] + vpxor ymm11, ymm9, [rcx+-64] + vpxor ymm12, ymm5, [r9+64] + vpxor ymm13, ymm6, [r9+-96] + vpxor ymm14, ymm7, [r8] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, 
ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+64] + vpxor ymm11, ymm7, [r8+-96] + vpxor ymm12, ymm8, [rcx] + vpxor ymm13, ymm9, [r9+128] + vpxor ymm14, ymm5, [r9+-32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+128] + vpxor ymm11, ymm5, [r8+-32] + vpxor ymm12, ymm6, [rcx+64] + vpxor ymm13, ymm7, [rcx+-96] + vpxor ymm14, ymm8, [r9+32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 
+ vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+96] + vpxor ymm11, ymm8, [r9+-64] + vpxor ymm12, ymm9, [r8+32] + vpxor ymm13, ymm5, [rcx+128] + vpxor ymm14, ymm6, [rcx+-32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm0 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Round 5 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm12, ymm2, [rcx] + vpxor ymm14, ymm4, [rcx+32] + vpxor ymm12, ymm12, [rcx+64] + vpxor ymm10, ymm10, [rcx+96] + vpxor ymm11, ymm11, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm11, ymm11, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm10, ymm10, [r8+64] + vpxor ymm12, ymm12, [r8+96] + vpxor ymm10, ymm10, [r8+128] + vpxor ymm13, ymm13, [r9+-96] + vpxor ymm14, ymm14, [r9+-32] + vpxor ymm11, ymm11, [r9] + vpxor ymm14, ymm14, [r9+32] + vpxor ymm12, ymm12, [r9+64] + vpxor ymm13, ymm13, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, 
ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+-64] + vpxor ymm12, ymm7, [rcx] + vpxor ymm13, ymm8, [rcx+-96] + vpxor ymm14, ymm9, [rcx+-32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+160] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+-64] + vpxor ymm11, ymm9, [r8] + vpxor ymm12, ymm5, [r8+64] + vpxor ymm13, ymm6, [r8+-32] + vpxor ymm14, ymm7, [r8+32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 
+ vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9] + vpxor ymm11, ymm7, [r9+64] + vpxor ymm12, ymm8, [r9+128] + vpxor ymm13, ymm9, [r9+32] + vpxor ymm14, ymm5, [r9+96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm0 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+32] + vpxor ymm11, ymm5, [rcx+96] + vpxor ymm12, ymm6, [r8+-96] + vpxor ymm13, ymm7, [rcx+64] + vpxor ymm14, ymm8, [rcx+128] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, 
ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+96] + vpxor ymm11, ymm8, [r9+-96] + vpxor ymm12, ymm9, [r9+-32] + vpxor ymm13, ymm5, [r8+128] + vpxor ymm14, ymm6, [r9+-64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Round 6 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm12, ymm2, [rcx] + vpxor ymm10, ymm10, [rcx+32] + vpxor ymm13, ymm13, [rcx+64] + vpxor ymm11, ymm11, [rcx+96] + vpxor ymm14, ymm14, [rcx+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm10, ymm10, [r8+-64] + vpxor ymm13, ymm13, [r8+-32] + vpxor ymm11, ymm11, [r8] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm10, ymm10, [r9] + vpxor ymm13, ymm13, [r9+32] + vpxor ymm11, ymm11, [r9+64] + vpxor ymm14, ymm14, [r9+96] + vpxor ymm12, ymm12, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + 
vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8] + vpxor ymm12, ymm7, [r9+128] + vpxor ymm13, ymm8, [rcx+64] + vpxor ymm14, ymm9, [r9+-64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+192] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+-96] + vpxor ymm11, ymm9, [r8+32] + vpxor ymm12, ymm5, [r9] + vpxor ymm13, ymm6, [rcx+96] + vpxor ymm14, ymm7, [r9+-32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, 
ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+-64] + vpxor ymm11, ymm7, [r8+64] + vpxor ymm12, ymm8, [r9+32] + vpxor ymm13, ymm9, [rcx+128] + vpxor ymm14, ymm5, [r8+96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+-32] + vpxor ymm11, ymm5, [r8+-64] + vpxor ymm12, ymm6, [r9+64] + vpxor ymm13, ymm7, [r8+-96] + vpxor ymm14, ymm8, [r8+128] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 
+ vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx] + vpxor ymm11, ymm8, [r8+-32] + vpxor ymm12, ymm9, [r9+96] + vpxor ymm13, ymm5, [rcx+32] + vpxor ymm14, ymm6, [r9+-96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Round 7 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-96] + vpxor ymm10, ymm10, [rcx+-64] + vpxor ymm10, ymm10, [rcx+-32] + vpxor ymm13, ymm3, [rcx+64] + vpxor ymm13, ymm13, [rcx+96] + vpxor ymm13, ymm13, [rcx+128] + vpxor ymm13, ymm13, [r8+-96] + vpxor ymm11, ymm1, [r8+-64] + vpxor ymm11, ymm11, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm14, ymm4, [r8+96] + vpxor ymm14, ymm14, [r8+128] + vpxor ymm14, ymm14, [r9+-64] + vpxor ymm14, ymm14, [r9+-32] + vpxor ymm12, ymm2, [r9] + vpxor ymm12, ymm12, [r9+32] + vpxor ymm12, ymm12, [r9+64] + vpxor ymm12, ymm12, [r9+128] + ; 
Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+32] + vpxor ymm12, ymm7, [r9+32] + vpxor ymm13, ymm8, [r8+-96] + vpxor ymm14, ymm9, [r9+-96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+224] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+64] + vpxor ymm11, ymm9, [r9+-32] + vpxor ymm12, ymm5, [rcx+-64] + vpxor ymm13, ymm6, [r8+-64] + vpxor ymm14, ymm7, [r9+96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor 
ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8] + vpxor ymm11, ymm7, [r9] + vpxor ymm12, ymm8, [rcx+128] + vpxor ymm13, ymm9, [r8+128] + vpxor ymm14, ymm5, [rcx] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+-64] + vpxor ymm11, ymm5, [rcx+-96] + vpxor ymm12, ymm6, [r8+64] + vpxor ymm13, ymm7, [r9+64] + vpxor ymm14, ymm8, [rcx+32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + 
vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+128] + vpxor ymm11, ymm8, [rcx+96] + vpxor ymm12, ymm9, [r8+96] + vpxor ymm13, ymm5, [rcx+-32] + vpxor ymm14, ymm6, [r8+-32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Round 8 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-96] + vpxor ymm12, ymm2, [rcx+-64] + vpxor ymm14, ymm4, [rcx] + vpxor ymm14, ymm14, [rcx+32] + vpxor ymm10, ymm10, [rcx+64] + vpxor ymm12, ymm12, [rcx+128] + vpxor ymm13, ymm3, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm10, ymm10, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm13, ymm13, [r8+128] + vpxor ymm14, ymm14, [r9+-96] + vpxor ymm10, ymm10, [r9+-64] + vpxor ymm11, ymm11, [r9+-32] + vpxor ymm11, ymm11, [r9] + vpxor 
ymm12, ymm12, [r9+32] + vpxor ymm13, ymm13, [r9+64] + vpxor ymm14, ymm14, [r9+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+-32] + vpxor ymm12, ymm7, [rcx+128] + vpxor ymm13, ymm8, [r9+64] + vpxor ymm14, ymm9, [r8+-32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+256] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+-96] + vpxor ymm11, ymm9, [r9+96] + vpxor ymm12, ymm5, [r8] + vpxor ymm13, ymm6, [rcx+-96] + vpxor ymm14, ymm7, [r8+96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + 
vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+32] + vpxor ymm11, ymm7, [rcx+-64] + vpxor ymm12, ymm8, [r8+128] + vpxor ymm13, ymm9, [rcx+32] + vpxor ymm14, ymm5, [r9+128] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+-96] + vpxor ymm11, ymm5, [rcx+64] + vpxor ymm12, ymm6, [r9] + vpxor ymm13, ymm7, [r8+64] + vpxor ymm14, ymm8, [rcx+-32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor 
ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+32] + vpxor ymm11, ymm8, [r8+-64] + vpxor ymm12, ymm9, [rcx] + vpxor ymm13, ymm5, [r9+-64] + vpxor ymm14, ymm6, [rcx+96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Round 9 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm13, ymm13, [rcx+32] + vpxor ymm11, ymm11, [rcx+64] + vpxor ymm12, ymm2, [rcx+128] + vpxor ymm10, ymm10, [r8+-96] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm12, ymm12, [r8] + vpxor ymm10, ymm10, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm14, ymm14, [r8+96] + vpxor ymm12, ymm12, [r8+128] + vpxor 
ymm10, ymm10, [r9+-96] + vpxor ymm11, ymm11, [r9+-32] + vpxor ymm12, ymm12, [r9] + vpxor ymm13, ymm13, [r9+64] + vpxor ymm11, ymm11, [r9+96] + vpxor ymm14, ymm14, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+96] + vpxor ymm12, ymm7, [r8+128] + vpxor ymm13, ymm8, [r8+64] + vpxor ymm14, ymm9, [rcx+96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+288] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+64] + vpxor ymm11, ymm9, [r8+96] + vpxor ymm12, ymm5, [r8+32] + vpxor ymm13, ymm6, [rcx+64] + vpxor ymm14, ymm7, [rcx] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, 
ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+-32] + vpxor ymm11, ymm7, [r8] + vpxor ymm12, ymm8, [rcx+32] + vpxor ymm13, ymm9, [rcx+-32] + vpxor ymm14, ymm5, [r9+32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+-32] + vpxor ymm11, ymm5, [r8+-96] + vpxor ymm12, ymm6, [rcx+-64] + vpxor ymm13, ymm7, [r9] + vpxor ymm14, ymm8, [r9+-64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + 
vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+128] + vpxor ymm11, ymm8, [rcx+-96] + vpxor ymm12, ymm9, [r9+128] + vpxor ymm13, ymm5, [r9+-96] + vpxor ymm14, ymm6, [r8+-64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Round 10 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-64] + vpxor ymm13, ymm3, [rcx+-32] + vpxor ymm14, ymm4, [rcx] + vpxor ymm12, ymm12, [rcx+32] + vpxor ymm13, ymm13, [rcx+64] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm11, ymm1, [r8+-96] + vpxor ymm10, ymm10, [r8+-32] + vpxor ymm11, ymm11, [r8] + vpxor ymm12, ymm12, [r8+32] + vpxor 
ymm13, ymm13, [r8+64] + vpxor ymm11, ymm11, [r8+96] + vpxor ymm12, ymm12, [r8+128] + vpxor ymm14, ymm14, [r9+-64] + vpxor ymm10, ymm10, [r9+-32] + vpxor ymm13, ymm13, [r9] + vpxor ymm14, ymm14, [r9+32] + vpxor ymm10, ymm10, [r9+64] + vpxor ymm11, ymm11, [r9+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+96] + vpxor ymm12, ymm7, [rcx+32] + vpxor ymm13, ymm8, [r9] + vpxor ymm14, ymm9, [r8+-64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+320] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+64] + vpxor ymm11, ymm9, [rcx] + vpxor ymm12, ymm5, [r9+-32] + vpxor ymm13, ymm6, [r8+-96] + vpxor ymm14, ymm7, [r9+128] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq 
ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+96] + vpxor ymm11, ymm7, [r8+32] + vpxor ymm12, ymm8, [rcx+-32] + vpxor ymm13, ymm9, [r9+-64] + vpxor ymm14, ymm5, [rcx+128] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+96] + vpxor ymm11, ymm5, [r9+64] + vpxor ymm12, ymm6, [r8] + vpxor ymm13, ymm7, [rcx+-64] + vpxor ymm14, ymm8, [r9+-96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, 
ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+128] + vpxor ymm11, ymm8, [rcx+64] + vpxor ymm12, ymm9, [r9+32] + vpxor ymm13, ymm5, [r8+-32] + vpxor ymm14, ymm6, [rcx+-96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Round 11 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm11, ymm1, [rcx] + vpxor ymm12, ymm12, [rcx+32] + vpxor ymm10, ymm10, [rcx+96] + vpxor ymm14, ymm4, [rcx+128] + vpxor ymm13, ymm13, [r8+-96] + vpxor 
ymm14, ymm14, [r8+-64] + vpxor ymm12, ymm12, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm10, ymm10, [r8+64] + vpxor ymm11, ymm11, [r8+96] + vpxor ymm14, ymm14, [r9+-96] + vpxor ymm13, ymm13, [r9+-64] + vpxor ymm12, ymm12, [r9+-32] + vpxor ymm13, ymm13, [r9] + vpxor ymm11, ymm11, [r9+64] + vpxor ymm10, ymm10, [r9+96] + vpxor ymm14, ymm14, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx] + vpxor ymm12, ymm7, [rcx+-32] + vpxor ymm13, ymm8, [rcx+-64] + vpxor ymm14, ymm9, [rcx+-96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+352] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9] + vpxor ymm11, ymm9, [r9+128] + vpxor ymm12, ymm5, [r9+96] + vpxor ymm13, ymm6, [r9+64] + vpxor ymm14, ymm7, [r9+32] + vpsrlq 
ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm0 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+96] + vpxor ymm11, ymm7, [r9+-32] + vpxor ymm12, ymm8, [r9+-64] + vpxor ymm13, ymm9, [r9+-96] + vpxor ymm14, ymm5, [r8+128] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+-64] + vpxor ymm11, ymm5, [r8+64] + vpxor ymm12, ymm6, [r8+32] + vpxor ymm13, ymm7, [r8] + vpxor ymm14, ymm8, [r8+-32] + vpsrlq ymm0, ymm10, 37 
+ vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+32] + vpxor ymm11, ymm8, [r8+-96] + vpxor ymm12, ymm9, [rcx+128] + vpxor ymm13, ymm5, [rcx+96] + vpxor ymm14, ymm6, [rcx+64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Round 12 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm11, ymm1, [rcx] + vpxor ymm10, 
ymm10, [r8+-64] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm13, ymm13, [r8] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm10, ymm10, [r8+96] + vpxor ymm14, ymm14, [r8+128] + vpxor ymm13, ymm13, [r9+-96] + vpxor ymm12, ymm12, [r9+-64] + vpxor ymm11, ymm11, [r9+-32] + vpxor ymm10, ymm10, [r9] + vpxor ymm14, ymm14, [r9+32] + vpxor ymm13, ymm13, [r9+64] + vpxor ymm12, ymm12, [r9+96] + vpxor ymm11, ymm11, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+128] + vpxor ymm12, ymm7, [r9+-64] + vpxor ymm13, ymm8, [r8] + vpxor ymm14, ymm9, [rcx+64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+384] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+-64] + vpxor ymm11, ymm9, [r9+32] + vpxor ymm12, 
ymm5, [r8+96] + vpxor ymm13, ymm6, [r8+64] + vpxor ymm14, ymm7, [rcx+128] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx] + vpxor ymm11, ymm7, [r9+96] + vpxor ymm12, ymm8, [r9+-96] + vpxor ymm13, ymm9, [r8+-32] + vpxor ymm14, ymm5, [rcx+32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+-96] + vpxor ymm11, ymm5, [r9] + vpxor ymm12, ymm6, 
[r9+-32] + vpxor ymm13, ymm7, [r8+32] + vpxor ymm14, ymm8, [rcx+96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+-32] + vpxor ymm11, ymm8, [r9+64] + vpxor ymm12, ymm9, [r8+128] + vpxor ymm13, ymm5, [r8+-64] + vpxor ymm14, ymm6, [r8+-96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Round 13 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-96] + vpxor 
ymm10, ymm10, [rcx+-64] + vpxor ymm10, ymm10, [rcx] + vpxor ymm14, ymm4, [rcx+32] + vpxor ymm14, ymm14, [rcx+64] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm14, ymm14, [rcx+128] + vpxor ymm13, ymm3, [r8+-32] + vpxor ymm13, ymm13, [r8] + vpxor ymm13, ymm13, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm12, ymm2, [r8+96] + vpxor ymm12, ymm12, [r9+-96] + vpxor ymm12, ymm12, [r9+-64] + vpxor ymm12, ymm12, [r9+-32] + vpxor ymm11, ymm1, [r9] + vpxor ymm11, ymm11, [r9+32] + vpxor ymm11, ymm11, [r9+96] + vpxor ymm11, ymm11, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+32] + vpxor ymm12, ymm7, [r9+-96] + vpxor ymm13, ymm8, [r8+32] + vpxor ymm14, ymm9, [r8+-96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+416] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR 
[r8+-96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8] + vpxor ymm11, ymm9, [rcx+128] + vpxor ymm12, ymm5, [rcx] + vpxor ymm13, ymm6, [r9] + vpxor ymm14, ymm7, [r8+128] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+128] + vpxor ymm11, ymm7, [r8+96] + vpxor ymm12, ymm8, [r8+-32] + vpxor ymm13, ymm9, [rcx+96] + vpxor ymm14, ymm5, [rcx+-32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; 
Row 3 + vpxor ymm10, ymm9, [rcx+64] + vpxor ymm11, ymm5, [rcx+-64] + vpxor ymm12, ymm6, [r9+96] + vpxor ymm13, ymm7, [r9+-32] + vpxor ymm14, ymm8, [r8+-64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+-64] + vpxor ymm11, ymm8, [r8+64] + vpxor ymm12, ymm9, [rcx+32] + vpxor ymm13, ymm5, [rcx+-96] + vpxor ymm14, ymm6, [r9+64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Round 
14 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm12, ymm2, [rcx] + vpxor ymm10, ymm10, [rcx+64] + vpxor ymm13, ymm3, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm14, ymm14, [r8+-96] + vpxor ymm14, ymm14, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm10, ymm10, [r8] + vpxor ymm13, ymm13, [r8+32] + vpxor ymm11, ymm11, [r8+96] + vpxor ymm14, ymm14, [r8+128] + vpxor ymm12, ymm12, [r9+-96] + vpxor ymm13, ymm13, [r9+-32] + vpxor ymm13, ymm13, [r9] + vpxor ymm11, ymm11, [r9+32] + vpxor ymm12, ymm12, [r9+96] + vpxor ymm10, ymm10, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+128] + vpxor ymm12, ymm7, [r8+-32] + vpxor ymm13, ymm8, [r9+-32] + vpxor ymm14, ymm9, [r9+64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+448] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + 
vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+32] + vpxor ymm11, ymm9, [r8+128] + vpxor ymm12, ymm5, [r9+128] + vpxor ymm13, ymm6, [rcx+-64] + vpxor ymm14, ymm7, [rcx+32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+32] + vpxor ymm11, ymm7, [rcx] + vpxor ymm12, ymm8, [rcx+96] + vpxor ymm13, ymm9, [r8+-64] + vpxor ymm14, ymm5, [r9+-64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu 
YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+-96] + vpxor ymm11, ymm5, [r8] + vpxor ymm12, ymm6, [r8+96] + vpxor ymm13, ymm7, [r9+96] + vpxor ymm14, ymm8, [rcx+-96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+-96] + vpxor ymm11, ymm8, [r9] + vpxor ymm12, ymm9, [rcx+-32] + vpxor ymm13, ymm5, [rcx+64] + vpxor ymm14, ymm6, [r8+64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm0 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR 
[rcx+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Round 15 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm11, ymm1, [rcx] + vpxor ymm14, ymm14, [rcx+32] + vpxor ymm12, ymm2, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm10, ymm10, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm11, ymm11, [r8] + vpxor ymm10, ymm10, [r8+32] + vpxor ymm12, ymm12, [r8+96] + vpxor ymm11, ymm11, [r8+128] + vpxor ymm14, ymm14, [r9+-64] + vpxor ymm13, ymm13, [r9+-32] + vpxor ymm10, ymm10, [r9+32] + vpxor ymm14, ymm14, [r9+64] + vpxor ymm13, ymm13, [r9+96] + vpxor ymm12, ymm12, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+128] + vpxor ymm12, ymm7, [rcx+96] + vpxor ymm13, ymm8, [r9+96] + vpxor ymm14, ymm9, [r8+64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, 
[rax+480] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+-32] + vpxor ymm11, ymm9, [rcx+32] + vpxor ymm12, ymm5, [r9+32] + vpxor ymm13, ymm6, [r8] + vpxor ymm14, ymm7, [rcx+-32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+128] + vpxor ymm11, ymm7, [r9+128] + vpxor ymm12, ymm8, [r8+-64] + vpxor ymm13, ymm9, [rcx+-96] + vpxor ymm14, ymm5, [r9+-96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor 
ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+64] + vpxor ymm11, ymm5, [r8+32] + vpxor ymm12, ymm6, [rcx] + vpxor ymm13, ymm7, [r8+96] + vpxor ymm14, ymm8, [rcx+64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+-32] + vpxor ymm11, ymm8, [rcx+-64] + vpxor ymm12, ymm9, [r9+-64] + vpxor ymm13, ymm5, [r8+-96] + vpxor ymm14, ymm6, [r9] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, 
ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Round 16 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm12, ymm2, [rcx] + vpxor ymm11, ymm1, [rcx+32] + vpxor ymm14, ymm14, [rcx+64] + vpxor ymm12, ymm12, [rcx+96] + vpxor ymm10, ymm10, [rcx+128] + vpxor ymm12, ymm12, [r8+-64] + vpxor ymm13, ymm13, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm11, ymm11, [r8+128] + vpxor ymm14, ymm14, [r9+-96] + vpxor ymm10, ymm10, [r9+-32] + vpxor ymm12, ymm12, [r9+32] + vpxor ymm10, ymm10, [r9+64] + vpxor ymm13, ymm13, [r9+96] + vpxor ymm11, ymm11, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+32] + vpxor ymm12, ymm7, [r8+-64] + vpxor ymm13, ymm8, [r8+96] + vpxor ymm14, ymm9, [r9] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + 
vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+512] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+96] + vpxor ymm11, ymm9, [rcx+-32] + vpxor ymm12, ymm5, [rcx+128] + vpxor ymm13, ymm6, [r8+32] + vpxor ymm14, ymm7, [r9+-64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+128] + vpxor ymm11, ymm7, [r9+32] + vpxor ymm12, ymm8, [rcx+-96] + vpxor ymm13, ymm9, [rcx+64] + vpxor ymm14, ymm5, [r8+-32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, 
ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+64] + vpxor ymm11, ymm5, [r9+-32] + vpxor ymm12, ymm6, [r9+128] + vpxor ymm13, ymm7, [rcx] + vpxor ymm14, ymm8, [r8+-96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+96] + vpxor ymm11, ymm8, [r8] + vpxor ymm12, ymm9, [r9+-96] + vpxor ymm13, ymm5, [r9+64] + vpxor ymm14, ymm6, [rcx+-64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + 
vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Round 17 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx] + vpxor ymm11, ymm11, [rcx+32] + vpxor ymm13, ymm13, [rcx+64] + vpxor ymm12, ymm12, [rcx+128] + vpxor ymm14, ymm4, [r8+-96] + vpxor ymm12, ymm12, [r8+-64] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm13, ymm13, [r8+32] + vpxor ymm10, ymm10, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm10, ymm10, [r8+128] + vpxor ymm14, ymm14, [r9+-64] + vpxor ymm11, ymm11, [r9+-32] + vpxor ymm14, ymm14, [r9] + vpxor ymm11, ymm11, [r9+32] + vpxor ymm10, ymm10, [r9+96] + vpxor ymm12, ymm12, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+-32] + vpxor ymm12, ymm7, [rcx+-96] + vpxor ymm13, ymm8, [rcx] + vpxor ymm14, ymm9, [rcx+-64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, 
ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+544] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+96] + vpxor ymm11, ymm9, [r9+-64] + vpxor ymm12, ymm5, [r8+128] + vpxor ymm13, ymm6, [r9+-32] + vpxor ymm14, ymm7, [r9+-96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+32] + vpxor ymm11, ymm7, [rcx+128] + vpxor ymm12, ymm8, [rcx+64] + vpxor ymm13, ymm9, [r8+-96] + vpxor ymm14, ymm5, [rcx+96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 
+ vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9] + vpxor ymm11, ymm5, [r9+96] + vpxor ymm12, ymm6, [r9+32] + vpxor ymm13, ymm7, [r9+128] + vpxor ymm14, ymm8, [r9+64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm0 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+-64] + vpxor ymm11, ymm8, [r8+32] + vpxor ymm12, ymm9, [r8+-32] + vpxor ymm13, ymm5, [r8+64] + vpxor ymm14, ymm6, [r8] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, 
ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Round 18 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx] + vpxor ymm10, ymm10, [rcx+32] + vpxor ymm12, ymm12, [rcx+64] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm13, ymm13, [r8+-96] + vpxor ymm10, ymm10, [r8+96] + vpxor ymm12, ymm12, [r8+128] + vpxor ymm14, ymm14, [r9+-96] + vpxor ymm11, ymm11, [r9+-64] + vpxor ymm13, ymm13, [r9+-32] + vpxor ymm10, ymm10, [r9] + vpxor ymm12, ymm12, [r9+32] + vpxor ymm14, ymm14, [r9+64] + vpxor ymm11, ymm11, [r9+96] + vpxor ymm13, ymm13, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+-64] + vpxor ymm12, ymm7, [rcx+64] + vpxor ymm13, ymm8, [r9+128] + vpxor ymm14, ymm9, [r8] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, 
ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+576] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx] + vpxor ymm11, ymm9, [r9+-96] + vpxor ymm12, ymm5, [rcx+32] + vpxor ymm13, ymm6, [r9+96] + vpxor ymm14, ymm7, [r8+-32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+-32] + vpxor ymm11, ymm7, [r8+128] + vpxor ymm12, ymm8, [r8+-96] + vpxor ymm13, ymm9, [r9+64] + vpxor ymm14, ymm5, [r8+-64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor 
ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+-64] + vpxor ymm11, ymm5, [r8+96] + vpxor ymm12, ymm6, [rcx+128] + vpxor ymm13, ymm7, [r9+32] + vpxor ymm14, ymm8, [r8+64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+-96] + vpxor ymm11, ymm8, [r9+-32] + vpxor ymm12, ymm9, [rcx+96] + vpxor ymm13, ymm5, [r9] + vpxor ymm14, ymm6, [r8+32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, 
ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Round 19 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-64] + vpxor ymm10, ymm10, [rcx+-32] + vpxor ymm10, ymm10, [rcx] + vpxor ymm12, ymm2, [rcx+32] + vpxor ymm12, ymm12, [rcx+64] + vpxor ymm12, ymm12, [rcx+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm14, ymm4, [r8+-64] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm11, ymm1, [r8+96] + vpxor ymm11, ymm11, [r8+128] + vpxor ymm11, ymm11, [r9+-96] + vpxor ymm11, ymm11, [r9+-64] + vpxor ymm13, ymm3, [r9+32] + vpxor ymm13, ymm13, [r9+64] + vpxor ymm13, ymm13, [r9+96] + vpxor ymm13, ymm13, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+-96] + vpxor ymm12, ymm7, [r8+-96] + vpxor ymm13, ymm8, [r9+32] + vpxor ymm14, ymm9, [r8+32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 
+ vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+608] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+128] + vpxor ymm11, ymm9, [r8+-32] + vpxor ymm12, ymm5, [rcx+-32] + vpxor ymm13, ymm6, [r8+96] + vpxor ymm14, ymm7, [rcx+96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+-64] + vpxor ymm11, ymm7, [rcx+32] + vpxor ymm12, ymm8, [r9+64] + vpxor ymm13, ymm9, [r8+64] + vpxor ymm14, ymm5, [rcx+-96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor 
ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8] + vpxor ymm11, ymm5, [rcx] + vpxor ymm12, ymm6, [r8+128] + vpxor ymm13, ymm7, [rcx+128] + vpxor ymm14, ymm8, [r9] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+64] + vpxor ymm11, ymm8, [r9+96] + vpxor ymm12, ymm9, [r8+-64] + vpxor ymm13, ymm5, [rcx+-64] + vpxor ymm14, ymm6, [r9+-32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + 
vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Round 20 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm11, ymm1, [rcx] + vpxor ymm11, ymm11, [rcx+32] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm13, ymm3, [rcx+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm11, ymm11, [r8+-32] + vpxor ymm10, ymm10, [r8] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm12, ymm12, [r8+128] + vpxor ymm11, ymm11, [r9+-96] + vpxor ymm10, ymm10, [r9+-64] + vpxor ymm14, ymm14, [r9] + vpxor ymm13, ymm13, [r9+32] + vpxor ymm12, ymm12, [r9+64] + vpxor ymm10, ymm10, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+-32] + vpxor ymm12, ymm7, [r9+64] + vpxor ymm13, ymm8, [rcx+128] + vpxor ymm14, ymm9, [r9+-32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq 
ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+640] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+32] + vpxor ymm11, ymm9, [rcx+96] + vpxor ymm12, ymm5, [r9+-64] + vpxor ymm13, ymm6, [rcx] + vpxor ymm14, ymm7, [r8+-64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+-96] + vpxor ymm11, ymm7, [rcx+-32] + vpxor ymm12, ymm8, [r8+64] + vpxor ymm13, ymm9, [r9] + vpxor ymm14, ymm5, [rcx+64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + 
vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+32] + vpxor ymm11, ymm5, [r9+128] + vpxor ymm12, ymm6, [rcx+32] + vpxor ymm13, ymm7, [r8+128] + vpxor ymm14, ymm8, [rcx+-64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+-96] + vpxor ymm11, ymm8, [r8+96] + vpxor ymm12, ymm9, [rcx+-96] + vpxor ymm13, ymm5, [r8] + vpxor ymm14, ymm6, [r9+96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq 
ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Round 21 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx] + vpxor ymm12, ymm2, [rcx+32] + vpxor ymm14, ymm14, [rcx+64] + vpxor ymm11, ymm11, [rcx+96] + vpxor ymm13, ymm13, [rcx+128] + vpxor ymm14, ymm14, [r8+-64] + vpxor ymm11, ymm11, [r8+-32] + vpxor ymm10, ymm10, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm13, ymm13, [r8+128] + vpxor ymm10, ymm10, [r9+-96] + vpxor ymm12, ymm12, [r9+-64] + vpxor ymm14, ymm14, [r9+-32] + vpxor ymm13, ymm13, [r9] + vpxor ymm10, ymm10, [r9+32] + vpxor ymm12, ymm12, [r9+64] + vpxor ymm11, ymm11, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+96] + vpxor ymm12, ymm7, [r8+64] + vpxor ymm13, ymm8, [r8+128] + vpxor ymm14, ymm9, [r9+96] + vpsrlq ymm0, 
ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+672] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+128] + vpxor ymm11, ymm9, [r8+-64] + vpxor ymm12, ymm5, [r9+-96] + vpxor ymm13, ymm6, [r9+128] + vpxor ymm14, ymm7, [rcx+-96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+-32] + vpxor ymm11, ymm7, [r9+-64] + vpxor ymm12, ymm8, [r9] + vpxor ymm13, ymm9, [rcx+-64] + vpxor ymm14, ymm5, [r8+-96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + 
vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+-32] + vpxor ymm11, ymm5, [r9+32] + vpxor ymm12, ymm6, [rcx+-32] + vpxor ymm13, ymm7, [rcx+32] + vpxor ymm14, ymm8, [r8] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm0 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+64] + vpxor ymm11, ymm8, [rcx] + vpxor ymm12, ymm9, [rcx+64] + vpxor ymm13, ymm5, [r8+32] + vpxor ymm14, ymm6, [r8+96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, 
ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Round 22 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm13, ymm13, [rcx+32] + vpxor ymm11, ymm1, [rcx+96] + vpxor ymm10, ymm10, [rcx+128] + vpxor ymm14, ymm14, [r8+-96] + vpxor ymm11, ymm11, [r8+-64] + vpxor ymm10, ymm10, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm13, ymm13, [r8+128] + vpxor ymm12, ymm12, [r9+-96] + vpxor ymm11, ymm11, [r9+-64] + vpxor ymm10, ymm10, [r9+-32] + vpxor ymm12, ymm12, [r9] + vpxor ymm11, ymm11, [r9+32] + vpxor ymm14, ymm14, [r9+96] + vpxor ymm13, ymm13, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+-64] + vpxor 
ymm12, ymm7, [r9] + vpxor ymm13, ymm8, [rcx+32] + vpxor ymm14, ymm9, [r8+96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+704] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+128] + vpxor ymm11, ymm9, [rcx+-96] + vpxor ymm12, ymm5, [r8+-32] + vpxor ymm13, ymm6, [r9+32] + vpxor ymm14, ymm7, [rcx+64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+96] + vpxor ymm11, ymm7, [r9+-96] + vpxor ymm12, ymm8, [rcx+-64] + vpxor ymm13, ymm9, [r8] + vpxor ymm14, ymm5, 
[r9+64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+96] + vpxor ymm11, ymm5, [rcx+128] + vpxor ymm12, ymm6, [r9+-64] + vpxor ymm13, ymm7, [rcx+-32] + vpxor ymm14, ymm8, [r8+32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+64] + vpxor ymm11, ymm8, [r9+128] + vpxor ymm12, ymm9, [r8+-96] + vpxor ymm13, ymm5, [r9+-32] + vpxor ymm14, ymm6, [rcx] 
+ vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Round 23 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-96] + vpxor ymm12, ymm2, [rcx+-64] + vpxor ymm13, ymm3, [rcx+-32] + vpxor ymm13, ymm13, [rcx+32] + vpxor ymm14, ymm4, [rcx+64] + vpxor ymm10, ymm10, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm11, ymm11, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm13, ymm13, [r8] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm14, ymm14, [r8+96] + vpxor ymm10, ymm10, [r8+128] + vpxor ymm11, ymm11, [r9+-96] + vpxor ymm12, ymm12, [r9+-64] + vpxor ymm12, ymm12, [r9] + vpxor ymm13, ymm13, [r9+32] + vpxor ymm14, ymm14, [r9+64] + vpxor ymm10, ymm10, [r9+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + 
; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+-96] + vpxor ymm12, ymm7, [rcx+-64] + vpxor ymm13, ymm8, [rcx+-32] + vpxor ymm14, ymm9, [rcx] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+736] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+32] + vpxor ymm11, ymm9, [rcx+64] + vpxor ymm12, ymm5, [rcx+96] + vpxor ymm13, ymm6, [rcx+128] + vpxor ymm14, ymm7, [r8+-96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+-64] + vpxor 
ymm11, ymm7, [r8+-32] + vpxor ymm12, ymm8, [r8] + vpxor ymm13, ymm9, [r8+32] + vpxor ymm14, ymm5, [r8+64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+96] + vpxor ymm11, ymm5, [r8+128] + vpxor ymm12, ymm6, [r9+-96] + vpxor ymm13, ymm7, [r9+-64] + vpxor ymm14, ymm8, [r9+-32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9] + vpxor ymm11, ymm8, 
[r9+32] + vpxor ymm12, ymm9, [r9+64] + vpxor ymm13, ymm5, [r9+96] + vpxor ymm14, ymm6, [r9+128] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm0 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + sub rcx, 128 + vmovdqu YMMWORD PTR [rcx], ymm15 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +sha3_256_blocksx4_seed_avx2 ENDP +_TEXT ENDS +ENDIF +IFDEF WOLFSSL_HAVE_MLDSA +_DATA SEGMENT +ALIGN 16 +L_sha3_256_blockx4_seed_64_avx2_end_mark QWORD 8000000000000000h, 8000000000000000h + QWORD 8000000000000000h, 8000000000000000h +ptr_L_sha3_256_blockx4_seed_64_avx2_end_mark QWORD L_sha3_256_blockx4_seed_64_avx2_end_mark +_DATA ENDS +_TEXT SEGMENT READONLY PARA +sha3_256_blocksx4_seed_64_avx2 PROC + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + 
vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + mov rax, QWORD PTR [ptr_L_sha3_x4_avx2_r] + mov r8, rcx + mov r9, rcx + vpbroadcastq ymm15, QWORD PTR [rdx] + add rcx, 128 + vpbroadcastq ymm11, QWORD PTR [rdx+8] + add r8, 384 + vpbroadcastq ymm12, QWORD PTR [rdx+16] + add r9, 640 + vpbroadcastq ymm13, QWORD PTR [rdx+24] + vpbroadcastq ymm14, QWORD PTR [rdx+32] + vpbroadcastq ymm0, QWORD PTR [rdx+40] + vpbroadcastq ymm1, QWORD PTR [rdx+48] + vpbroadcastq ymm2, QWORD PTR [rdx+56] + vmovdqu ymm3, YMMWORD PTR [rcx+128] + vmovdqu YMMWORD PTR [rcx+-96], ymm11 + vmovdqu YMMWORD PTR [rcx+-64], ymm12 + vmovdqu YMMWORD PTR [rcx+-32], ymm13 + vmovdqu YMMWORD PTR [rcx], ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vpxor ymm4, ymm4, ymm4 + vpxor ymm6, ymm6, ymm6 + vmovdqu ymm5, YMMWORD PTR L_sha3_256_blockx4_seed_64_avx2_end_mark + vmovdqu YMMWORD PTR [r8+-96], ymm6 + vmovdqu YMMWORD PTR [r8+-64], ymm6 + vmovdqu YMMWORD PTR [r8+-32], ymm6 + vmovdqu YMMWORD PTR [r8], ymm6 + vmovdqu YMMWORD PTR [r8+32], ymm6 + vmovdqu YMMWORD PTR [r8+64], ymm6 + vmovdqu YMMWORD PTR [r8+96], ymm6 + vmovdqu YMMWORD PTR [r8+128], ymm5 + vmovdqu YMMWORD PTR [r9+-96], ymm6 + vmovdqu YMMWORD PTR [r9+-64], ymm6 + vmovdqu YMMWORD PTR [r9+-32], ymm6 + vmovdqu YMMWORD PTR [r9], ymm6 + vmovdqu YMMWORD PTR [r9+32], ymm6 + vmovdqu YMMWORD PTR [r9+64], ymm6 + vmovdqu YMMWORD PTR [r9+96], ymm6 + vmovdqu YMMWORD PTR [r9+128], ymm6 + vmovdqu ymm10, ymm15 + ; Round 0 + ; Calc b[0..4] + vpxor ymm10, ymm15, ymm0 + vpxor ymm11, ymm11, ymm1 + vpxor ymm12, ymm12, ymm2 + vpxor ymm13, ymm13, ymm3 + vpxor ymm14, ymm14, ymm4 + vpxor ymm10, ymm10, [r8+-64] + vpxor ymm11, ymm11, [r8+-32] + vpxor ymm12, ymm12, [r8] + vpxor ymm13, ymm13, [r8+32] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm10, ymm10, [r8+96] + vpxor ymm11, ymm11, [r8+128] + vpxor ymm12, ymm12, 
[r9+-96] + vpxor ymm13, ymm13, [r9+-64] + vpxor ymm14, ymm14, [r9+-32] + vpxor ymm10, ymm10, [r9] + vpxor ymm11, ymm11, [r9+32] + vpxor ymm12, ymm12, [r9+64] + vpxor ymm13, ymm13, [r9+96] + vpxor ymm14, ymm14, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+64] + vpxor ymm12, ymm7, [r8] + vpxor ymm13, ymm8, [r9+-64] + vpxor ymm14, ymm9, [r9+128] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+-32] + vpxor ymm11, ymm9, [r8+-96] + vpxor ymm12, ymm5, [r8+-64] + vpxor ymm13, ymm6, [r8+128] + vpxor ymm14, ymm7, [r9+64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 
+ vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+-96] + vpxor ymm11, ymm7, [rcx+96] + vpxor ymm12, ymm8, [r8+32] + vpxor ymm13, ymm9, [r9+-32] + vpxor ymm14, ymm5, [r9] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx] + vpxor ymm11, ymm5, [rcx+32] + vpxor ymm12, ymm6, [r8+-32] + vpxor ymm13, ymm7, [r9+-96] + vpxor ymm14, ymm8, [r9+96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq 
ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+-64] + vpxor ymm11, ymm8, [rcx+128] + vpxor ymm12, ymm9, [r8+64] + vpxor ymm13, ymm5, [r8+96] + vpxor ymm14, ymm6, [r9+32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Round 1 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-96] + vpxor ymm10, ymm10, [rcx+-32] + vpxor ymm10, ymm10, [rcx] + vpxor ymm11, ymm1, [rcx+32] + vpxor ymm11, ymm11, [rcx+64] + vpxor ymm11, ymm11, [rcx+96] + vpxor ymm11, ymm11, [r8+-96] + vpxor ymm12, ymm2, [r8+-64] + vpxor ymm12, ymm12, 
[r8+-32] + vpxor ymm12, ymm12, [r8] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm13, ymm3, [r8+128] + vpxor ymm13, ymm13, [r9+-96] + vpxor ymm13, ymm13, [r9+-64] + vpxor ymm13, ymm13, [r9+-32] + vpxor ymm14, ymm4, [r9] + vpxor ymm14, ymm14, [r9+64] + vpxor ymm14, ymm14, [r9+96] + vpxor ymm14, ymm14, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+-96] + vpxor ymm12, ymm7, [r8+32] + vpxor ymm13, ymm8, [r9+-96] + vpxor ymm14, ymm9, [r9+32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+32] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+-64] + vpxor ymm11, ymm9, [r9+64] + vpxor ymm12, ymm5, [rcx+-96] + vpxor ymm13, ymm6, [rcx+32] + vpxor ymm14, ymm7, [r8+64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + 
vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm0 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+64] + vpxor ymm11, ymm7, [r8+-64] + vpxor ymm12, ymm8, [r9+-32] + vpxor ymm13, ymm9, [r9+96] + vpxor ymm14, ymm5, [rcx+-64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+128] + vpxor ymm11, ymm5, [rcx+-32] + vpxor ymm12, ymm6, [rcx+96] + vpxor ymm13, ymm7, [r8+-32] + vpxor ymm14, ymm8, [r8+96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + 
vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8] + vpxor ymm11, ymm8, [r8+128] + vpxor ymm12, ymm9, [r9] + vpxor ymm13, ymm5, [rcx] + vpxor ymm14, ymm6, [rcx+128] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Round 2 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx+32] + vpxor ymm10, ymm10, [rcx+64] + vpxor ymm12, ymm12, 
[rcx+96] + vpxor ymm11, ymm11, [r8+-96] + vpxor ymm11, ymm11, [r8+-64] + vpxor ymm13, ymm13, [r8+-32] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm14, ymm14, [r8+96] + vpxor ymm13, ymm13, [r9+-96] + vpxor ymm10, ymm10, [r9+-64] + vpxor ymm12, ymm12, [r9+-32] + vpxor ymm14, ymm14, [r9+32] + vpxor ymm11, ymm11, [r9+64] + vpxor ymm13, ymm13, [r9+96] + vpxor ymm10, ymm10, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+64] + vpxor ymm12, ymm7, [r9+-32] + vpxor ymm13, ymm8, [r8+-32] + vpxor ymm14, ymm9, [rcx+128] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+64] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+-96] + vpxor ymm11, ymm9, [r8+64] + vpxor ymm12, ymm5, [rcx+64] + vpxor 
ymm13, ymm6, [rcx+-32] + vpxor ymm14, ymm7, [r9] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+-96] + vpxor ymm11, ymm7, [rcx+-96] + vpxor ymm12, ymm8, [r9+96] + vpxor ymm13, ymm9, [r8+96] + vpxor ymm14, ymm5, [r8] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+32] + vpxor ymm11, ymm5, [r9+-64] + vpxor ymm12, ymm6, [r8+-64] + vpxor ymm13, ymm7, 
[rcx+96] + vpxor ymm14, ymm8, [rcx] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm0 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+32] + vpxor ymm11, ymm8, [rcx+32] + vpxor ymm12, ymm9, [rcx+-64] + vpxor ymm13, ymm5, [r9+128] + vpxor ymm14, ymm6, [r8+128] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Round 3 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-32] + vpxor 
ymm14, ymm4, [rcx] + vpxor ymm12, ymm2, [rcx+64] + vpxor ymm13, ymm13, [rcx+96] + vpxor ymm14, ymm14, [rcx+128] + vpxor ymm10, ymm10, [r8+-96] + vpxor ymm12, ymm12, [r8+-64] + vpxor ymm13, ymm13, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm10, ymm10, [r9+-96] + vpxor ymm11, ymm11, [r9+-64] + vpxor ymm12, ymm12, [r9+-32] + vpxor ymm14, ymm14, [r9] + vpxor ymm10, ymm10, [r9+32] + vpxor ymm11, ymm11, [r9+64] + vpxor ymm12, ymm12, [r9+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+64] + vpxor ymm12, ymm7, [r9+96] + vpxor ymm13, ymm8, [rcx+96] + vpxor ymm14, ymm9, [r8+128] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+96] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 1 + vpxor 
ymm10, ymm8, [r8+-32] + vpxor ymm11, ymm9, [r9] + vpxor ymm12, ymm5, [r8+-96] + vpxor ymm13, ymm6, [r9+-64] + vpxor ymm14, ymm7, [rcx+-64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+64] + vpxor ymm11, ymm7, [rcx+64] + vpxor ymm12, ymm8, [r8+96] + vpxor ymm13, ymm9, [rcx] + vpxor ymm14, ymm5, [r8+32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, 
[rcx+128] + vpxor ymm11, ymm5, [r9+-96] + vpxor ymm12, ymm6, [rcx+-96] + vpxor ymm13, ymm7, [r8+-64] + vpxor ymm14, ymm8, [r9+128] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+-32] + vpxor ymm11, ymm8, [rcx+-32] + vpxor ymm12, ymm9, [r8] + vpxor ymm13, ymm5, [r9+32] + vpxor ymm14, ymm6, [rcx+32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Round 4 + ; Calc b[0..4] + vpxor 
ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm13, ymm3, [rcx] + vpxor ymm11, ymm1, [rcx+64] + vpxor ymm13, ymm13, [rcx+96] + vpxor ymm10, ymm10, [rcx+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm10, ymm10, [r8+-32] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm12, ymm12, [r8+96] + vpxor ymm14, ymm14, [r8+128] + vpxor ymm11, ymm11, [r9+-96] + vpxor ymm13, ymm13, [r9+-64] + vpxor ymm11, ymm11, [r9] + vpxor ymm10, ymm10, [r9+64] + vpxor ymm12, ymm12, [r9+96] + vpxor ymm14, ymm14, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9] + vpxor ymm12, ymm7, [r8+96] + vpxor ymm13, ymm8, [r8+-64] + vpxor ymm14, ymm9, [rcx+32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+128] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + 
vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+96] + vpxor ymm11, ymm9, [rcx+-64] + vpxor ymm12, ymm5, [r9+64] + vpxor ymm13, ymm6, [r9+-96] + vpxor ymm14, ymm7, [r8] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+64] + vpxor ymm11, ymm7, [r8+-96] + vpxor ymm12, ymm8, [rcx] + vpxor ymm13, ymm9, [r9+128] + vpxor ymm14, ymm5, [r9+-32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD 
PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+128] + vpxor ymm11, ymm5, [r8+-32] + vpxor ymm12, ymm6, [rcx+64] + vpxor ymm13, ymm7, [rcx+-96] + vpxor ymm14, ymm8, [r9+32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+96] + vpxor ymm11, ymm8, [r9+-64] + vpxor ymm12, ymm9, [r8+32] + vpxor ymm13, ymm5, [rcx+128] + vpxor ymm14, ymm6, [rcx+-32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm0 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR 
[rcx+128], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Round 5 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm12, ymm2, [rcx] + vpxor ymm14, ymm4, [rcx+32] + vpxor ymm12, ymm12, [rcx+64] + vpxor ymm10, ymm10, [rcx+96] + vpxor ymm11, ymm11, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm11, ymm11, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm10, ymm10, [r8+64] + vpxor ymm12, ymm12, [r8+96] + vpxor ymm10, ymm10, [r8+128] + vpxor ymm13, ymm13, [r9+-96] + vpxor ymm14, ymm14, [r9+-32] + vpxor ymm11, ymm11, [r9] + vpxor ymm14, ymm14, [r9+32] + vpxor ymm12, ymm12, [r9+64] + vpxor ymm13, ymm13, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+-64] + vpxor ymm12, ymm7, [rcx] + vpxor ymm13, ymm8, [rcx+-96] + vpxor ymm14, ymm9, [rcx+-32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+160] + vpxor ymm3, ymm3, ymm13 + vpxor 
ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+-64] + vpxor ymm11, ymm9, [r8] + vpxor ymm12, ymm5, [r8+64] + vpxor ymm13, ymm6, [r8+-32] + vpxor ymm14, ymm7, [r8+32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9] + vpxor ymm11, ymm7, [r9+64] + vpxor ymm12, ymm8, [r9+128] + vpxor ymm13, ymm9, [r9+32] + vpxor ymm14, ymm5, [r9+96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm0 + 
vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+32] + vpxor ymm11, ymm5, [rcx+96] + vpxor ymm12, ymm6, [r8+-96] + vpxor ymm13, ymm7, [rcx+64] + vpxor ymm14, ymm8, [rcx+128] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+96] + vpxor ymm11, ymm8, [r9+-96] + vpxor ymm12, ymm9, [r9+-32] + vpxor ymm13, ymm5, [r8+128] + vpxor ymm14, ymm6, [r9+-64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + 
vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Round 6 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm12, ymm2, [rcx] + vpxor ymm10, ymm10, [rcx+32] + vpxor ymm13, ymm13, [rcx+64] + vpxor ymm11, ymm11, [rcx+96] + vpxor ymm14, ymm14, [rcx+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm10, ymm10, [r8+-64] + vpxor ymm13, ymm13, [r8+-32] + vpxor ymm11, ymm11, [r8] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm10, ymm10, [r9] + vpxor ymm13, ymm13, [r9+32] + vpxor ymm11, ymm11, [r9+64] + vpxor ymm14, ymm14, [r9+96] + vpxor ymm12, ymm12, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8] + vpxor ymm12, ymm7, [r9+128] + vpxor ymm13, ymm8, [rcx+64] + vpxor ymm14, ymm9, [r9+-64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, 
ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+192] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+-96] + vpxor ymm11, ymm9, [r8+32] + vpxor ymm12, ymm5, [r9] + vpxor ymm13, ymm6, [rcx+96] + vpxor ymm14, ymm7, [r9+-32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+-64] + vpxor ymm11, ymm7, [r8+64] + vpxor ymm12, ymm8, [r9+32] + vpxor ymm13, ymm9, [rcx+128] + vpxor ymm14, ymm5, [r8+96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, 
ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+-32] + vpxor ymm11, ymm5, [r8+-64] + vpxor ymm12, ymm6, [r9+64] + vpxor ymm13, ymm7, [r8+-96] + vpxor ymm14, ymm8, [r8+128] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx] + vpxor ymm11, ymm8, [r8+-32] + vpxor ymm12, ymm9, [r9+96] + vpxor ymm13, ymm5, [rcx+32] + vpxor ymm14, ymm6, [r9+-96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, 
ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Round 7 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-96] + vpxor ymm10, ymm10, [rcx+-64] + vpxor ymm10, ymm10, [rcx+-32] + vpxor ymm13, ymm3, [rcx+64] + vpxor ymm13, ymm13, [rcx+96] + vpxor ymm13, ymm13, [rcx+128] + vpxor ymm13, ymm13, [r8+-96] + vpxor ymm11, ymm1, [r8+-64] + vpxor ymm11, ymm11, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm14, ymm4, [r8+96] + vpxor ymm14, ymm14, [r8+128] + vpxor ymm14, ymm14, [r9+-64] + vpxor ymm14, ymm14, [r9+-32] + vpxor ymm12, ymm2, [r9] + vpxor ymm12, ymm12, [r9+32] + vpxor ymm12, ymm12, [r9+64] + vpxor ymm12, ymm12, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+32] + vpxor ymm12, ymm7, [r9+32] + vpxor ymm13, ymm8, [r8+-96] + vpxor ymm14, ymm9, [r9+-96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn 
ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+224] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+64] + vpxor ymm11, ymm9, [r9+-32] + vpxor ymm12, ymm5, [rcx+-64] + vpxor ymm13, ymm6, [r8+-64] + vpxor ymm14, ymm7, [r9+96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8] + vpxor ymm11, ymm7, [r9] + vpxor ymm12, ymm8, [rcx+128] + vpxor ymm13, ymm9, [r8+128] + vpxor ymm14, ymm5, [rcx] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + 
vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+-64] + vpxor ymm11, ymm5, [rcx+-96] + vpxor ymm12, ymm6, [r8+64] + vpxor ymm13, ymm7, [r9+64] + vpxor ymm14, ymm8, [rcx+32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+128] + vpxor ymm11, ymm8, [rcx+96] + vpxor ymm12, ymm9, [r8+96] + vpxor ymm13, ymm5, [rcx+-32] + vpxor ymm14, ymm6, [r8+-32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, 
ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Round 8 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-96] + vpxor ymm12, ymm2, [rcx+-64] + vpxor ymm14, ymm4, [rcx] + vpxor ymm14, ymm14, [rcx+32] + vpxor ymm10, ymm10, [rcx+64] + vpxor ymm12, ymm12, [rcx+128] + vpxor ymm13, ymm3, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm10, ymm10, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm13, ymm13, [r8+128] + vpxor ymm14, ymm14, [r9+-96] + vpxor ymm10, ymm10, [r9+-64] + vpxor ymm11, ymm11, [r9+-32] + vpxor ymm11, ymm11, [r9] + vpxor ymm12, ymm12, [r9+32] + vpxor ymm13, ymm13, [r9+64] + vpxor ymm14, ymm14, [r9+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+-32] + vpxor ymm12, ymm7, [rcx+128] + vpxor ymm13, ymm8, [r9+64] + vpxor ymm14, ymm9, [r8+-32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + 
vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+256] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+-96] + vpxor ymm11, ymm9, [r9+96] + vpxor ymm12, ymm5, [r8] + vpxor ymm13, ymm6, [rcx+-96] + vpxor ymm14, ymm7, [r8+96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+32] + vpxor ymm11, ymm7, [rcx+-64] + vpxor ymm12, ymm8, [r8+128] + vpxor ymm13, ymm9, [rcx+32] + vpxor ymm14, ymm5, [r9+128] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, 
ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+-96] + vpxor ymm11, ymm5, [rcx+64] + vpxor ymm12, ymm6, [r9] + vpxor ymm13, ymm7, [r8+64] + vpxor ymm14, ymm8, [rcx+-32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+32] + vpxor ymm11, ymm8, [r8+-64] + vpxor ymm12, ymm9, [rcx] + vpxor ymm13, ymm5, [r9+-64] + vpxor ymm14, ymm6, [rcx+96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn 
ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Round 9 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm13, ymm13, [rcx+32] + vpxor ymm11, ymm11, [rcx+64] + vpxor ymm12, ymm2, [rcx+128] + vpxor ymm10, ymm10, [r8+-96] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm12, ymm12, [r8] + vpxor ymm10, ymm10, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm14, ymm14, [r8+96] + vpxor ymm12, ymm12, [r8+128] + vpxor ymm10, ymm10, [r9+-96] + vpxor ymm11, ymm11, [r9+-32] + vpxor ymm12, ymm12, [r9] + vpxor ymm13, ymm13, [r9+64] + vpxor ymm11, ymm11, [r9+96] + vpxor ymm14, ymm14, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+96] + vpxor ymm12, ymm7, [r8+128] + vpxor ymm13, ymm8, [r8+64] + vpxor ymm14, ymm9, [rcx+96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, 
ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+288] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+64] + vpxor ymm11, ymm9, [r8+96] + vpxor ymm12, ymm5, [r8+32] + vpxor ymm13, ymm6, [rcx+64] + vpxor ymm14, ymm7, [rcx] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+-32] + vpxor ymm11, ymm7, [r8] + vpxor ymm12, ymm8, [rcx+32] + vpxor ymm13, ymm9, [rcx+-32] + vpxor ymm14, ymm5, [r9+32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, 
ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+-32] + vpxor ymm11, ymm5, [r8+-96] + vpxor ymm12, ymm6, [rcx+-64] + vpxor ymm13, ymm7, [r9] + vpxor ymm14, ymm8, [r9+-64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+128] + vpxor ymm11, ymm8, [rcx+-96] + vpxor ymm12, ymm9, [r9+128] + vpxor ymm13, ymm5, [r9+-96] + vpxor ymm14, ymm6, [r8+-64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor 
ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Round 10 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-64] + vpxor ymm13, ymm3, [rcx+-32] + vpxor ymm14, ymm4, [rcx] + vpxor ymm12, ymm12, [rcx+32] + vpxor ymm13, ymm13, [rcx+64] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm11, ymm1, [r8+-96] + vpxor ymm10, ymm10, [r8+-32] + vpxor ymm11, ymm11, [r8] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm11, ymm11, [r8+96] + vpxor ymm12, ymm12, [r8+128] + vpxor ymm14, ymm14, [r9+-64] + vpxor ymm10, ymm10, [r9+-32] + vpxor ymm13, ymm13, [r9] + vpxor ymm14, ymm14, [r9+32] + vpxor ymm10, ymm10, [r9+64] + vpxor ymm11, ymm11, [r9+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+96] + vpxor ymm12, ymm7, [rcx+32] + vpxor ymm13, ymm8, [r9] + vpxor ymm14, ymm9, [r8+-64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 
21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+320] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+64] + vpxor ymm11, ymm9, [rcx] + vpxor ymm12, ymm5, [r9+-32] + vpxor ymm13, ymm6, [r8+-96] + vpxor ymm14, ymm7, [r9+128] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+96] + vpxor ymm11, ymm7, [r8+32] + vpxor ymm12, ymm8, [rcx+-32] + vpxor ymm13, ymm9, [r9+-64] + vpxor ymm14, ymm5, [rcx+128] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, 
ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+96] + vpxor ymm11, ymm5, [r9+64] + vpxor ymm12, ymm6, [r8] + vpxor ymm13, ymm7, [rcx+-64] + vpxor ymm14, ymm8, [r9+-96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+128] + vpxor ymm11, ymm8, [rcx+64] + vpxor ymm12, ymm9, [r9+32] + vpxor ymm13, ymm5, [r8+-32] + vpxor ymm14, ymm6, [rcx+-96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + 
vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Round 11 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm11, ymm1, [rcx] + vpxor ymm12, ymm12, [rcx+32] + vpxor ymm10, ymm10, [rcx+96] + vpxor ymm14, ymm4, [rcx+128] + vpxor ymm13, ymm13, [r8+-96] + vpxor ymm14, ymm14, [r8+-64] + vpxor ymm12, ymm12, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm10, ymm10, [r8+64] + vpxor ymm11, ymm11, [r8+96] + vpxor ymm14, ymm14, [r9+-96] + vpxor ymm13, ymm13, [r9+-64] + vpxor ymm12, ymm12, [r9+-32] + vpxor ymm13, ymm13, [r9] + vpxor ymm11, ymm11, [r9+64] + vpxor ymm10, ymm10, [r9+96] + vpxor ymm14, ymm14, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx] + vpxor ymm12, ymm7, [rcx+-32] + vpxor ymm13, ymm8, [rcx+-64] + vpxor ymm14, ymm9, [rcx+-96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq 
ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+352] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9] + vpxor ymm11, ymm9, [r9+128] + vpxor ymm12, ymm5, [r9+96] + vpxor ymm13, ymm6, [r9+64] + vpxor ymm14, ymm7, [r9+32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm0 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [r9+32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+96] + vpxor ymm11, ymm7, [r9+-32] + vpxor ymm12, ymm8, [r9+-64] + vpxor ymm13, ymm9, [r9+-96] + vpxor ymm14, ymm5, [r8+128] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + 
vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [r9+-96], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+-64] + vpxor ymm11, ymm5, [r8+64] + vpxor ymm12, ymm6, [r8+32] + vpxor ymm13, ymm7, [r8] + vpxor ymm14, ymm8, [r8+-32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+32] + vpxor ymm11, ymm8, [r8+-96] + vpxor ymm12, ymm9, [rcx+128] + vpxor ymm13, ymm5, [rcx+96] + vpxor ymm14, ymm6, [rcx+64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, 
ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [r8+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Round 12 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm11, ymm1, [rcx] + vpxor ymm10, ymm10, [r8+-64] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm13, ymm13, [r8] + vpxor ymm12, ymm12, [r8+32] + vpxor ymm11, ymm11, [r8+64] + vpxor ymm10, ymm10, [r8+96] + vpxor ymm14, ymm14, [r8+128] + vpxor ymm13, ymm13, [r9+-96] + vpxor ymm12, ymm12, [r9+-64] + vpxor ymm11, ymm11, [r9+-32] + vpxor ymm10, ymm10, [r9] + vpxor ymm14, ymm14, [r9+32] + vpxor ymm13, ymm13, [r9+64] + vpxor ymm12, ymm12, [r9+96] + vpxor ymm11, ymm11, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+128] + vpxor ymm12, ymm7, [r9+-64] + vpxor ymm13, ymm8, [r8] + vpxor ymm14, ymm9, [rcx+64] + 
vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+384] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+-64] + vpxor ymm11, ymm9, [r9+32] + vpxor ymm12, ymm5, [r8+96] + vpxor ymm13, ymm6, [r8+64] + vpxor ymm14, ymm7, [rcx+128] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx] + vpxor ymm11, ymm7, [r9+96] + vpxor ymm12, ymm8, [r9+-96] + vpxor ymm13, ymm9, [r8+-32] + vpxor ymm14, ymm5, [rcx+32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 
39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r8+-32], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+-96] + vpxor ymm11, ymm5, [r9] + vpxor ymm12, ymm6, [r9+-32] + vpxor ymm13, ymm7, [r8+32] + vpxor ymm14, ymm8, [rcx+96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR [r9+-32], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+-32] + vpxor ymm11, ymm8, [r9+64] + vpxor ymm12, ymm9, [r8+128] + vpxor ymm13, ymm5, [r8+-64] + vpxor ymm14, ymm6, [r8+-96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq 
ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [r9+64], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Round 13 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-96] + vpxor ymm10, ymm10, [rcx+-64] + vpxor ymm10, ymm10, [rcx] + vpxor ymm14, ymm4, [rcx+32] + vpxor ymm14, ymm14, [rcx+64] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm14, ymm14, [rcx+128] + vpxor ymm13, ymm3, [r8+-32] + vpxor ymm13, ymm13, [r8] + vpxor ymm13, ymm13, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm12, ymm2, [r8+96] + vpxor ymm12, ymm12, [r9+-96] + vpxor ymm12, ymm12, [r9+-64] + vpxor ymm12, ymm12, [r9+-32] + vpxor ymm11, ymm1, [r9] + vpxor ymm11, ymm11, [r9+32] + vpxor ymm11, ymm11, [r9+96] + vpxor ymm11, ymm11, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+32] + 
vpxor ymm12, ymm7, [r9+-96] + vpxor ymm13, ymm8, [r8+32] + vpxor ymm14, ymm9, [r8+-96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+416] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8] + vpxor ymm11, ymm9, [rcx+128] + vpxor ymm12, ymm5, [rcx] + vpxor ymm13, ymm6, [r9] + vpxor ymm14, ymm7, [r8+128] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+128] + vpxor ymm11, ymm7, [r8+96] + vpxor ymm12, ymm8, [r8+-32] + vpxor ymm13, ymm9, [rcx+96] + vpxor ymm14, ymm5, 
[rcx+-32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+64] + vpxor ymm11, ymm5, [rcx+-64] + vpxor ymm12, ymm6, [r9+96] + vpxor ymm13, ymm7, [r9+-32] + vpxor ymm14, ymm8, [r8+-64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+-64] + vpxor ymm11, ymm8, [r8+64] + vpxor ymm12, ymm9, [rcx+32] + vpxor ymm13, ymm5, [rcx+-96] + vpxor ymm14, 
ymm6, [r9+64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm0 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Round 14 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-64] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm12, ymm2, [rcx] + vpxor ymm10, ymm10, [rcx+64] + vpxor ymm13, ymm3, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm14, ymm14, [r8+-96] + vpxor ymm14, ymm14, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm10, ymm10, [r8] + vpxor ymm13, ymm13, [r8+32] + vpxor ymm11, ymm11, [r8+96] + vpxor ymm14, ymm14, [r8+128] + vpxor ymm12, ymm12, [r9+-96] + vpxor ymm13, ymm13, [r9+-32] + vpxor ymm13, ymm13, [r9] + vpxor ymm11, ymm11, [r9+32] + vpxor ymm12, ymm12, [r9+96] + vpxor ymm10, ymm10, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, 
ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+128] + vpxor ymm12, ymm7, [r8+-32] + vpxor ymm13, ymm8, [r9+-32] + vpxor ymm14, ymm9, [r9+64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+448] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+32] + vpxor ymm11, ymm9, [r8+128] + vpxor ymm12, ymm5, [r9+128] + vpxor ymm13, ymm6, [rcx+-64] + vpxor ymm14, ymm7, [rcx+32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [rcx+32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+32] 
+ vpxor ymm11, ymm7, [rcx] + vpxor ymm12, ymm8, [rcx+96] + vpxor ymm13, ymm9, [r8+-64] + vpxor ymm14, ymm5, [r9+-64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r8+-64], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+-96] + vpxor ymm11, ymm5, [r8] + vpxor ymm12, ymm6, [r8+96] + vpxor ymm13, ymm7, [r9+96] + vpxor ymm14, ymm8, [rcx+-96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+-96] + vpxor ymm11, 
ymm8, [r9] + vpxor ymm12, ymm9, [rcx+-32] + vpxor ymm13, ymm5, [rcx+64] + vpxor ymm14, ymm6, [r8+64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm0 + vmovdqu YMMWORD PTR [r9], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Round 15 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm11, ymm1, [rcx] + vpxor ymm14, ymm14, [rcx+32] + vpxor ymm12, ymm2, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm10, ymm10, [r8+-96] + vpxor ymm13, ymm13, [r8+-64] + vpxor ymm12, ymm12, [r8+-32] + vpxor ymm11, ymm11, [r8] + vpxor ymm10, ymm10, [r8+32] + vpxor ymm12, ymm12, [r8+96] + vpxor ymm11, ymm11, [r8+128] + vpxor ymm14, ymm14, [r9+-64] + vpxor ymm13, ymm13, [r9+-32] + vpxor ymm10, ymm10, [r9+32] + vpxor ymm14, ymm14, [r9+64] + vpxor ymm13, ymm13, [r9+96] + vpxor ymm12, ymm12, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + 
vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+128] + vpxor ymm12, ymm7, [rcx+96] + vpxor ymm13, ymm8, [r9+96] + vpxor ymm14, ymm9, [r8+64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+480] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+-32] + vpxor ymm11, ymm9, [rcx+32] + vpxor ymm12, ymm5, [r9+32] + vpxor ymm13, ymm6, [r8] + vpxor ymm14, ymm7, [rcx+-32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 
+ vmovdqu YMMWORD PTR [rcx+-32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+128] + vpxor ymm11, ymm7, [r9+128] + vpxor ymm12, ymm8, [r8+-64] + vpxor ymm13, ymm9, [rcx+-96] + vpxor ymm14, ymm5, [r9+-96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-96], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+64] + vpxor ymm11, ymm5, [r8+32] + vpxor ymm12, ymm6, [rcx] + vpxor ymm13, ymm7, [r8+96] + vpxor ymm14, ymm8, [rcx+64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + 
vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+-32] + vpxor ymm11, ymm8, [rcx+-64] + vpxor ymm12, ymm9, [r9+-64] + vpxor ymm13, ymm5, [r8+-96] + vpxor ymm14, ymm6, [r9] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [rcx+-64], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Round 16 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm13, ymm3, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-32] + vpxor ymm12, ymm2, [rcx] + vpxor ymm11, ymm1, [rcx+32] + vpxor ymm14, ymm14, [rcx+64] + vpxor ymm12, ymm12, [rcx+96] + vpxor ymm10, ymm10, [rcx+128] + vpxor ymm12, ymm12, [r8+-64] + vpxor ymm13, ymm13, [r8] + vpxor ymm11, ymm11, [r8+32] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm11, ymm11, [r8+128] + vpxor ymm14, ymm14, [r9+-96] + vpxor ymm10, ymm10, [r9+-32] + vpxor ymm12, ymm12, [r9+32] + vpxor ymm10, ymm10, [r9+64] + vpxor ymm13, ymm13, [r9+96] + vpxor ymm11, ymm11, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor 
ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+32] + vpxor ymm12, ymm7, [r8+-64] + vpxor ymm13, ymm8, [r8+96] + vpxor ymm14, ymm9, [r9] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+512] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+96] + vpxor ymm11, ymm9, [rcx+-32] + vpxor ymm12, ymm5, [rcx+128] + vpxor ymm13, ymm6, [r8+32] + vpxor ymm14, ymm7, [r9+-64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm0 + vmovdqu 
YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [r9+-64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+128] + vpxor ymm11, ymm7, [r9+32] + vpxor ymm12, ymm8, [rcx+-96] + vpxor ymm13, ymm9, [rcx+64] + vpxor ymm14, ymm5, [r8+-32] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+64] + vpxor ymm11, ymm5, [r9+-32] + vpxor ymm12, ymm6, [r9+128] + vpxor ymm13, ymm7, [rcx] + vpxor ymm14, ymm8, [r8+-96] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu 
YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [r9+128], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+96] + vpxor ymm11, ymm8, [r8] + vpxor ymm12, ymm9, [r9+-96] + vpxor ymm13, ymm5, [r9+64] + vpxor ymm14, ymm6, [rcx+-64] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Round 17 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx] + vpxor ymm11, ymm11, [rcx+32] + vpxor ymm13, ymm13, [rcx+64] + vpxor ymm12, ymm12, [rcx+128] + vpxor ymm14, ymm4, [r8+-96] + vpxor ymm12, ymm12, [r8+-64] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm13, ymm13, [r8+32] + vpxor ymm10, ymm10, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm10, ymm10, [r8+128] + vpxor ymm14, ymm14, [r9+-64] + vpxor ymm11, ymm11, [r9+-32] + vpxor ymm14, ymm14, [r9] + vpxor ymm11, ymm11, [r9+32] + vpxor ymm10, ymm10, [r9+96] + vpxor ymm12, ymm12, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, 
ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+-32] + vpxor ymm12, ymm7, [rcx+-96] + vpxor ymm13, ymm8, [rcx] + vpxor ymm14, ymm9, [rcx+-64] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+544] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+96] + vpxor ymm11, ymm9, [r9+-64] + vpxor ymm12, ymm5, [r8+128] + vpxor ymm13, ymm6, [r9+-32] + vpxor ymm14, ymm7, [r9+-96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, 
ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [r9+-96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+32] + vpxor ymm11, ymm7, [rcx+128] + vpxor ymm12, ymm8, [rcx+64] + vpxor ymm13, ymm9, [r8+-96] + vpxor ymm14, ymm5, [rcx+96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [r8+-96], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9] + vpxor ymm11, ymm5, [r9+96] + vpxor ymm12, ymm6, [r9+32] + vpxor ymm13, ymm7, [r9+128] + vpxor ymm14, ymm8, [r9+64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 
+ vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm0 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r9+32], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+-64] + vpxor ymm11, ymm8, [r8+32] + vpxor ymm12, ymm9, [r8+-32] + vpxor ymm13, ymm5, [r8+64] + vpxor ymm14, ymm6, [r8] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Round 18 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm12, ymm2, [rcx+-96] + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx] + vpxor ymm10, ymm10, [rcx+32] + vpxor ymm12, ymm12, [rcx+64] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm13, ymm13, [r8+-96] + vpxor ymm10, ymm10, [r8+96] + vpxor ymm12, ymm12, [r8+128] + vpxor ymm14, ymm14, [r9+-96] + vpxor ymm11, ymm11, [r9+-64] + vpxor ymm13, ymm13, [r9+-32] + vpxor ymm10, ymm10, [r9] + vpxor ymm12, ymm12, [r9+32] + vpxor ymm14, ymm14, [r9+64] + vpxor ymm11, ymm11, [r9+96] + vpxor ymm13, ymm13, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + 
vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+-64] + vpxor ymm12, ymm7, [rcx+64] + vpxor ymm13, ymm8, [r9+128] + vpxor ymm14, ymm9, [r8] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+576] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx] + vpxor ymm11, ymm9, [r9+-96] + vpxor ymm12, ymm5, [rcx+32] + vpxor ymm13, ymm6, [r9+96] + vpxor ymm14, ymm7, [r8+-32] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, 
ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [r8+-32], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+-32] + vpxor ymm11, ymm7, [r8+128] + vpxor ymm12, ymm8, [r8+-96] + vpxor ymm13, ymm9, [r9+64] + vpxor ymm14, ymm5, [r8+-64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-32], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [r9+64], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [rcx+-64] + vpxor ymm11, ymm5, [r8+96] + vpxor ymm12, ymm6, [rcx+128] + vpxor ymm13, ymm7, [r9+32] + vpxor ymm14, ymm8, [r8+64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, 
ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-64], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+-96] + vpxor ymm11, ymm8, [r9+-32] + vpxor ymm12, ymm9, [rcx+96] + vpxor ymm13, ymm5, [r9] + vpxor ymm14, ymm6, [r8+32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm0 + vmovdqu YMMWORD PTR [r9+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Round 19 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm10, ymm10, [rcx+-64] + vpxor ymm10, ymm10, [rcx+-32] + vpxor ymm10, ymm10, [rcx] + vpxor ymm12, ymm2, [rcx+32] + vpxor ymm12, ymm12, [rcx+64] + vpxor ymm12, ymm12, [rcx+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm14, ymm4, [r8+-64] + vpxor ymm14, ymm14, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm14, ymm14, [r8+64] + vpxor ymm11, ymm1, [r8+96] + vpxor ymm11, ymm11, [r8+128] + vpxor ymm11, ymm11, [r9+-96] + vpxor ymm11, ymm11, [r9+-64] + vpxor ymm13, ymm3, [r9+32] + vpxor ymm13, ymm13, [r9+64] + vpxor ymm13, ymm13, [r9+96] + vpxor ymm13, ymm13, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq 
ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r9+-96] + vpxor ymm12, ymm7, [r8+-96] + vpxor ymm13, ymm8, [r9+32] + vpxor ymm14, ymm9, [r8+32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+608] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+128] + vpxor ymm11, ymm9, [r8+-32] + vpxor ymm12, ymm5, [rcx+-32] + vpxor ymm13, ymm6, [r8+96] + vpxor ymm14, ymm7, [rcx+96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + 
vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+128], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [rcx+96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+-64] + vpxor ymm11, ymm7, [rcx+32] + vpxor ymm12, ymm8, [r9+64] + vpxor ymm13, ymm9, [r8+64] + vpxor ymm14, ymm5, [rcx+-96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-64], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8] + vpxor ymm11, ymm5, [rcx] + vpxor ymm12, ymm6, [r8+128] + vpxor ymm13, ymm7, [rcx+128] + vpxor ymm14, ymm8, [r9] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn 
ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r9], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [rcx+64] + vpxor ymm11, ymm8, [r9+96] + vpxor ymm12, ymm9, [r8+-64] + vpxor ymm13, ymm5, [rcx+-64] + vpxor ymm14, ymm6, [r9+-32] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [r9+96], ymm1 + vmovdqu YMMWORD PTR [r8+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Round 20 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm11, ymm1, [rcx] + vpxor ymm11, ymm11, [rcx+32] + vpxor ymm14, ymm14, [rcx+96] + vpxor ymm13, ymm3, [rcx+128] + vpxor ymm12, ymm12, [r8+-96] + vpxor ymm11, ymm11, [r8+-32] + vpxor ymm10, ymm10, [r8] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm13, ymm13, [r8+64] + vpxor ymm13, ymm13, [r8+96] + vpxor ymm12, ymm12, [r8+128] + vpxor ymm11, ymm11, [r9+-96] + vpxor ymm10, ymm10, [r9+-64] + vpxor ymm14, ymm14, [r9] + vpxor ymm13, ymm13, [r9+32] + vpxor ymm12, ymm12, 
[r9+64] + vpxor ymm10, ymm10, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+-32] + vpxor ymm12, ymm7, [r9+64] + vpxor ymm13, ymm8, [rcx+128] + vpxor ymm14, ymm9, [r9+-32] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+640] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r9+32] + vpxor ymm11, ymm9, [rcx+96] + vpxor ymm12, ymm5, [r9+-64] + vpxor ymm13, ymm6, [rcx] + vpxor ymm14, ymm7, [r8+-64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, 
ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+32], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [rcx], ymm3 + vmovdqu YMMWORD PTR [r8+-64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r9+-96] + vpxor ymm11, ymm7, [rcx+-32] + vpxor ymm12, ymm8, [r8+64] + vpxor ymm13, ymm9, [r9] + vpxor ymm14, ymm5, [rcx+64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-96], ymm0 + vmovdqu YMMWORD PTR [rcx+-32], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r9], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+32] + vpxor ymm11, ymm5, [r9+128] + vpxor ymm12, ymm6, [rcx+32] + vpxor ymm13, ymm7, [r8+128] + vpxor ymm14, ymm8, [rcx+-64] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 
+ vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+32], ymm0 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [rcx+-64], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+-96] + vpxor ymm11, ymm8, [r8+96] + vpxor ymm12, ymm9, [rcx+-96] + vpxor ymm13, ymm5, [r8] + vpxor ymm14, ymm6, [r9+96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-96], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [rcx+-96], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Round 21 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-64] + vpxor ymm11, ymm1, [rcx+-32] + vpxor ymm13, ymm3, [rcx] + vpxor ymm12, ymm2, [rcx+32] + vpxor ymm14, ymm14, [rcx+64] + vpxor ymm11, ymm11, [rcx+96] + vpxor ymm13, ymm13, [rcx+128] + vpxor ymm14, ymm14, [r8+-64] + vpxor ymm11, ymm11, [r8+-32] + vpxor ymm10, ymm10, [r8+32] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm13, ymm13, [r8+128] + vpxor ymm10, ymm10, [r9+-96] + vpxor ymm12, ymm12, [r9+-64] + vpxor ymm14, 
ymm14, [r9+-32] + vpxor ymm13, ymm13, [r9] + vpxor ymm10, ymm10, [r9+32] + vpxor ymm12, ymm12, [r9+64] + vpxor ymm11, ymm11, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+96] + vpxor ymm12, ymm7, [r8+64] + vpxor ymm13, ymm8, [r8+128] + vpxor ymm14, ymm9, [r9+96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+672] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm3 + vmovdqu YMMWORD PTR [r9+96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+128] + vpxor ymm11, ymm9, [r8+-64] + vpxor ymm12, ymm5, [r9+-96] + vpxor ymm13, ymm6, [r9+128] + vpxor ymm14, ymm7, [rcx+-96] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 
45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm3 + vmovdqu YMMWORD PTR [rcx+-96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+-32] + vpxor ymm11, ymm7, [r9+-64] + vpxor ymm12, ymm8, [r9] + vpxor ymm13, ymm9, [rcx+-64] + vpxor ymm14, ymm5, [r8+-96] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-32], ymm0 + vmovdqu YMMWORD PTR [r9+-64], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [rcx+-64], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+-32] + vpxor ymm11, ymm5, [r9+32] + vpxor ymm12, ymm6, [rcx+-32] + vpxor ymm13, ymm7, [rcx+32] + vpxor ymm14, ymm8, [r8] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + 
vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+-32], ymm0 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [rcx+-32], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r8], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9+64] + vpxor ymm11, ymm8, [rcx] + vpxor ymm12, ymm9, [rcx+64] + vpxor ymm13, ymm5, [r8+32] + vpxor ymm14, ymm6, [r8+96] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+64], ymm0 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Round 22 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm14, ymm4, [rcx+-96] + vpxor ymm13, ymm3, [rcx+-64] + vpxor ymm12, ymm2, [rcx+-32] + vpxor ymm13, ymm13, [rcx+32] + vpxor ymm11, ymm1, [rcx+96] + vpxor ymm10, ymm10, [rcx+128] + vpxor ymm14, ymm14, [r8+-96] + vpxor ymm11, ymm11, [r8+-64] + vpxor ymm10, ymm10, [r8+-32] + vpxor ymm14, ymm14, [r8] + vpxor ymm12, ymm12, [r8+64] + vpxor ymm13, 
ymm13, [r8+128] + vpxor ymm12, ymm12, [r9+-96] + vpxor ymm11, ymm11, [r9+-64] + vpxor ymm10, ymm10, [r9+-32] + vpxor ymm12, ymm12, [r9] + vpxor ymm11, ymm11, [r9+32] + vpxor ymm14, ymm14, [r9+96] + vpxor ymm13, ymm13, [r9+128] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [r8+-64] + vpxor ymm12, ymm7, [r9] + vpxor ymm13, ymm8, [rcx+32] + vpxor ymm14, ymm9, [r8+96] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+704] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm1 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [r8+96], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [r8+128] + vpxor ymm11, ymm9, [rcx+-96] + vpxor ymm12, ymm5, [r8+-32] + vpxor ymm13, ymm6, [r9+32] + vpxor ymm14, ymm7, [rcx+64] + vpsrlq ymm0, ymm10, 36 + vpsrlq ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, 
ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [r8+-32], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [rcx+96] + vpxor ymm11, ymm7, [r9+-96] + vpxor ymm12, ymm8, [rcx+-64] + vpxor ymm13, ymm9, [r8] + vpxor ymm14, ymm5, [r9+64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [r9+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [r8], ymm3 + vmovdqu YMMWORD PTR [r9+64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r9+96] + vpxor ymm11, ymm5, [rcx+128] + vpxor ymm12, ymm6, [r9+-64] + vpxor ymm13, ymm7, [rcx+-32] + vpxor ymm14, ymm8, [r8+32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 
27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9+96], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [r9+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [r8+32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r8+64] + vpxor ymm11, ymm8, [r9+128] + vpxor ymm12, ymm9, [r8+-96] + vpxor ymm13, ymm5, [r9+-32] + vpxor ymm14, ymm6, [rcx] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [r9+128], ymm1 + vmovdqu YMMWORD PTR [r8+-96], ymm2 + vmovdqu YMMWORD PTR [r9+-32], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Round 23 + ; Calc b[0..4] + vpxor ymm10, ymm0, ymm15 + vpxor ymm11, ymm1, [rcx+-96] + vpxor ymm12, ymm2, [rcx+-64] + vpxor ymm13, ymm3, [rcx+-32] + vpxor ymm13, ymm13, [rcx+32] + vpxor ymm14, ymm4, [rcx+64] + vpxor ymm10, ymm10, [rcx+96] + vpxor ymm11, ymm11, [rcx+128] + vpxor ymm11, ymm11, [r8+-64] + vpxor 
ymm12, ymm12, [r8+-32] + vpxor ymm13, ymm13, [r8] + vpxor ymm14, ymm14, [r8+32] + vpxor ymm14, ymm14, [r8+96] + vpxor ymm10, ymm10, [r8+128] + vpxor ymm11, ymm11, [r9+-96] + vpxor ymm12, ymm12, [r9+-64] + vpxor ymm12, ymm12, [r9] + vpxor ymm13, ymm13, [r9+32] + vpxor ymm14, ymm14, [r9+64] + vpxor ymm10, ymm10, [r9+96] + ; Calc t[0..4] + vpsrlq ymm0, ymm11, 63 + vpsrlq ymm1, ymm12, 63 + vpsrlq ymm2, ymm13, 63 + vpsrlq ymm3, ymm14, 63 + vpsrlq ymm4, ymm10, 63 + vpaddq ymm5, ymm11, ymm11 + vpaddq ymm6, ymm12, ymm12 + vpaddq ymm7, ymm13, ymm13 + vpaddq ymm8, ymm14, ymm14 + vpaddq ymm9, ymm10, ymm10 + vpor ymm5, ymm5, ymm0 + vpor ymm6, ymm6, ymm1 + vpor ymm7, ymm7, ymm2 + vpor ymm8, ymm8, ymm3 + vpor ymm9, ymm9, ymm4 + vpxor ymm5, ymm5, ymm14 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vpxor ymm8, ymm8, ymm12 + vpxor ymm9, ymm9, ymm13 + ; Row Mix + ; Row 0 + vpxor ymm10, ymm5, ymm15 + vpxor ymm11, ymm6, [rcx+-96] + vpxor ymm12, ymm7, [rcx+-64] + vpxor ymm13, ymm8, [rcx+-32] + vpxor ymm14, ymm9, [rcx] + vpsrlq ymm0, ymm11, 20 + vpsrlq ymm1, ymm12, 21 + vpsrlq ymm2, ymm13, 43 + vpsrlq ymm3, ymm14, 50 + vpsllq ymm11, ymm11, 44 + vpsllq ymm12, ymm12, 43 + vpsllq ymm13, ymm13, 21 + vpsllq ymm14, ymm14, 14 + vpor ymm11, ymm11, ymm0 + vpor ymm12, ymm12, ymm1 + vpor ymm13, ymm13, ymm2 + vpor ymm14, ymm14, ymm3 + vpandn ymm15, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm15, ymm15, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + ; XOR in constant + vpxor ymm15, ymm15, [rax+736] + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+-96], ymm1 + vmovdqu YMMWORD PTR [rcx+-64], ymm2 + vmovdqu YMMWORD PTR [rcx+-32], ymm3 + vmovdqu YMMWORD PTR [rcx], ymm4 + ; Row 1 + vpxor ymm10, ymm8, [rcx+32] + vpxor ymm11, ymm9, [rcx+64] + vpxor ymm12, ymm5, [rcx+96] + vpxor ymm13, ymm6, [rcx+128] + vpxor ymm14, ymm7, [r8+-96] + vpsrlq ymm0, ymm10, 36 + vpsrlq 
ymm1, ymm11, 44 + vpsrlq ymm2, ymm12, 61 + vpsrlq ymm3, ymm13, 19 + vpsrlq ymm4, ymm14, 3 + vpsllq ymm10, ymm10, 28 + vpsllq ymm11, ymm11, 20 + vpsllq ymm12, ymm12, 3 + vpsllq ymm13, ymm13, 45 + vpsllq ymm14, ymm14, 61 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [rcx+128], ymm3 + vmovdqu YMMWORD PTR [r8+-96], ymm4 + ; Row 2 + vpxor ymm10, ymm6, [r8+-64] + vpxor ymm11, ymm7, [r8+-32] + vpxor ymm12, ymm8, [r8] + vpxor ymm13, ymm9, [r8+32] + vpxor ymm14, ymm5, [r8+64] + vpsrlq ymm0, ymm10, 63 + vpsrlq ymm1, ymm11, 58 + vpsrlq ymm2, ymm12, 39 + vpsrlq ymm3, ymm13, 56 + vpsrlq ymm4, ymm14, 46 + vpaddq ymm10, ymm10, ymm10 + vpsllq ymm11, ymm11, 6 + vpsllq ymm12, ymm12, 25 + vpsllq ymm13, ymm13, 8 + vpsllq ymm14, ymm14, 18 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+-64], ymm0 + vmovdqu YMMWORD PTR [r8+-32], ymm1 + vmovdqu YMMWORD PTR [r8], ymm2 + vmovdqu YMMWORD PTR [r8+32], ymm3 + vmovdqu YMMWORD PTR [r8+64], ymm4 + ; Row 3 + vpxor ymm10, ymm9, [r8+96] + vpxor ymm11, ymm5, [r8+128] + vpxor ymm12, ymm6, [r9+-96] + vpxor ymm13, ymm7, [r9+-64] + vpxor ymm14, ymm8, [r9+-32] + vpsrlq ymm0, ymm10, 37 + vpsrlq ymm1, ymm11, 
28 + vpsrlq ymm2, ymm12, 54 + vpsrlq ymm3, ymm13, 49 + vpsrlq ymm4, ymm14, 8 + vpsllq ymm10, ymm10, 27 + vpsllq ymm11, ymm11, 36 + vpsllq ymm12, ymm12, 10 + vpsllq ymm13, ymm13, 15 + vpsllq ymm14, ymm14, 56 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r8+96], ymm0 + vmovdqu YMMWORD PTR [r8+128], ymm1 + vmovdqu YMMWORD PTR [r9+-96], ymm2 + vmovdqu YMMWORD PTR [r9+-64], ymm3 + vmovdqu YMMWORD PTR [r9+-32], ymm4 + ; Row 4 + vpxor ymm10, ymm7, [r9] + vpxor ymm11, ymm8, [r9+32] + vpxor ymm12, ymm9, [r9+64] + vpxor ymm13, ymm5, [r9+96] + vpxor ymm14, ymm6, [r9+128] + vpsrlq ymm0, ymm10, 2 + vpsrlq ymm1, ymm11, 9 + vpsrlq ymm2, ymm12, 25 + vpsrlq ymm3, ymm13, 23 + vpsrlq ymm4, ymm14, 62 + vpsllq ymm10, ymm10, 62 + vpsllq ymm11, ymm11, 55 + vpsllq ymm12, ymm12, 39 + vpsllq ymm13, ymm13, 41 + vpsllq ymm14, ymm14, 2 + vpor ymm10, ymm10, ymm0 + vpor ymm11, ymm11, ymm1 + vpor ymm12, ymm12, ymm2 + vpor ymm13, ymm13, ymm3 + vpor ymm14, ymm14, ymm4 + vpandn ymm0, ymm11, ymm12 + vpandn ymm1, ymm12, ymm13 + vpandn ymm2, ymm13, ymm14 + vpandn ymm3, ymm14, ymm10 + vpandn ymm4, ymm10, ymm11 + vpxor ymm0, ymm0, ymm10 + vpxor ymm1, ymm1, ymm11 + vpxor ymm2, ymm2, ymm12 + vpxor ymm3, ymm3, ymm13 + vpxor ymm4, ymm4, ymm14 + vmovdqu YMMWORD PTR [r9], ymm0 + vmovdqu YMMWORD PTR [r9+32], ymm1 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm3 + vmovdqu YMMWORD PTR [r9+128], ymm4 + sub rcx, 128 + vmovdqu YMMWORD PTR [rcx], ymm15 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD 
PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +sha3_256_blocksx4_seed_64_avx2 ENDP +_TEXT ENDS +ENDIF +ENDIF +END diff --git a/wolfcrypt/src/sha512_asm.asm b/wolfcrypt/src/sha512_asm.asm new file mode 100644 index 00000000000..07cebd52a70 --- /dev/null +++ b/wolfcrypt/src/sha512_asm.asm @@ -0,0 +1,10774 @@ +; /* sha512_asm.asm */ +; /* +; * Copyright (C) 2006-2026 wolfSSL Inc. +; * +; * This file is part of wolfSSL. +; * +; * wolfSSL is free software; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation; either version 3 of the License, or +; * (at your option) any later version. +; * +; * wolfSSL is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. 
+; * +; * You should have received a copy of the GNU General Public License +; * along with this program; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA +; */ + +; Assembler capability check: when @Version is below 1200, define +; NO_AVX2_SUPPORT and NO_MOVBE_SUPPORT (unless the build already defined them). +IF @Version LT 1200 +; AVX2 instructions not recognized by old versions of MASM +IFNDEF NO_AVX2_SUPPORT +NO_AVX2_SUPPORT = 1 +ENDIF +; MOVBE instruction not recognized by old versions of MASM +IFNDEF NO_MOVBE_SUPPORT +NO_MOVBE_SUPPORT = 1 +ENDIF +ENDIF + +; Feature selection: HAVE_INTEL_AVX1 defaults to 1 when not predefined; +; HAVE_INTEL_AVX2 is set only when NO_AVX2_SUPPORT is not defined. +IFNDEF HAVE_INTEL_AVX1 +HAVE_INTEL_AVX1 = 1 +ENDIF +IFNDEF NO_AVX2_SUPPORT +HAVE_INTEL_AVX2 = 1 +ENDIF + +; Default _WIN64 to 1 when the build has not defined it. +IFNDEF _WIN64 +_WIN64 = 1 +ENDIF + +; AVX1 implementation starts here. +IFDEF HAVE_INTEL_AVX1 +; L_avx1_sha512_k: the 80 64-bit SHA-512 round constants K[0..79] (FIPS 180-4), +; 16-byte aligned. The transform routines add them to the message words +; 16 constants (128 bytes) at a time. +_DATA SEGMENT +ALIGN 16 +L_avx1_sha512_k QWORD 428a2f98d728ae22h, 7137449123ef65cdh + QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch + QWORD 3956c25bf348b538h, 59f111f1b605d019h + QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h + QWORD 0d807aa98a3030242h, 12835b0145706fbeh + QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h + QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h + QWORD 9bdc06a725c71235h, 0c19bf174cf692694h + QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h + QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h + QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h + QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h + QWORD 983e5152ee66dfabh, 0a831c66d2db43210h + QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h + QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h + QWORD 06ca6351e003826fh, 142929670a0e6e70h + QWORD 27b70a8546d22ffch, 2e1b21385c26c926h + QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh + QWORD 650a73548baf63deh, 766a0abb3c77b2a8h + QWORD 81c2c92e47edaee6h, 92722c851482353bh + QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h + QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h + QWORD 0d192e819d6ef5218h, 0d69906245565a910h + QWORD 0f40e35855771202ah, 106aa07032bbd1b8h + QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h + QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h + QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh + QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h + QWORD 748f82ee5defb2fch, 
78a5636f43172f60h + QWORD 84c87814a1f0ab72h, 8cc702081a6439ech + QWORD 90befffa23631e28h, 0a4506cebde82bde9h + QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh + QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h + QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h + QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h + QWORD 113f9804bef90daeh, 1b710b35131c471bh + QWORD 28db77f523047d84h, 32caab7b40c72493h + QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch + QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah + QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h +; Address of the constant table; the transforms load it into a register as the table base. +ptr_L_avx1_sha512_k QWORD L_avx1_sha512_k +_DATA ENDS +; vpshufb control mask that reverses the byte order inside each 64-bit lane +; (used to byte-swap the message qwords as they are loaded). +_DATA SEGMENT +ALIGN 16 +L_avx1_sha512_flip_mask QWORD 0001020304050607h, 08090a0b0c0d0e0fh +ptr_L_avx1_sha512_flip_mask QWORD L_avx1_sha512_flip_mask +_DATA ENDS +; Transform_Sha512_AVX1: run the 80-round SHA-512 compression on one 128-byte +; block using AVX1 for the message schedule and scalar registers for the rounds. +; In: rcx = base pointer. Eight 64-bit state words are read from offsets 0..56 +; and the 128-byte message block from offsets 64..191. +; NOTE(review): presumably the hash context with its input buffer at offset 64; +; confirm against the C structure. +; Out: the eight words at offsets 0..56 are incremented by the round results; rax = 0. +; Stack frame (280 bytes): offsets 0..127 hold the 16 message-plus-constant qwords +; for the current group of rounds, offset 128 the loop counter, and +; offsets 136..279 the saved xmm6-xmm14. +; Saves and restores rbx, r12-r15, rdi, rsi and xmm6-xmm14 (callee-saved under the +; Windows x64 convention). +_TEXT SEGMENT READONLY PARA +Transform_Sha512_AVX1 PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + ; rdi keeps the base pointer for the whole routine (rcx is reused as a temporary) + mov rdi, rcx + sub rsp, 280 + ; Save callee-saved XMM registers; xmm13 is stored before xmm12 and the epilogue reloads from the same slots + vmovdqu OWORD PTR [rsp+136], xmm6 + vmovdqu OWORD PTR [rsp+152], xmm7 + vmovdqu OWORD PTR [rsp+168], xmm8 + vmovdqu OWORD PTR [rsp+184], xmm9 + vmovdqu OWORD PTR [rsp+200], xmm10 + vmovdqu OWORD PTR [rsp+216], xmm11 + vmovdqu OWORD PTR [rsp+232], xmm13 + vmovdqu OWORD PTR [rsp+248], xmm12 + vmovdqu OWORD PTR [rsp+264], xmm14 + ; rax points at the message block, 64 bytes past the base pointer + lea rax, QWORD PTR [rdi+64] + ; xmm14 holds the byte-swap mask for the loads below + vmovdqa xmm14, OWORD PTR L_avx1_sha512_flip_mask + ; Load the eight state words into r8-r15 + mov r8, QWORD PTR [rdi] + mov r9, QWORD PTR [rdi+8] + mov r10, QWORD PTR [rdi+16] + mov r11, QWORD PTR [rdi+24] + mov r12, QWORD PTR [rdi+32] + mov r13, QWORD PTR [rdi+40] + mov r14, QWORD PTR [rdi+48] + mov r15, QWORD PTR [rdi+56] + ; Load the 16 message qwords into xmm0-xmm7, byte-swapping each 64-bit lane + vmovdqu xmm0, OWORD PTR [rax] + vmovdqu xmm1, OWORD PTR [rax+16] + vpshufb xmm0, xmm0, xmm14 + vpshufb xmm1, xmm1, xmm14 + vmovdqu xmm2, OWORD PTR [rax+32] + vmovdqu xmm3, OWORD PTR [rax+48] + vpshufb xmm2, xmm2, xmm14 + vpshufb xmm3, xmm3, xmm14 + vmovdqu xmm4, OWORD PTR [rax+64] + vmovdqu xmm5, OWORD PTR [rax+80] + vpshufb xmm4, xmm4, xmm14 + vpshufb xmm5, xmm5, xmm14 + vmovdqu xmm6, OWORD PTR [rax+96] + vmovdqu xmm7, OWORD PTR [rax+112] + 
vpshufb xmm6, xmm6, xmm14 + vpshufb xmm7, xmm7, xmm14 + mov DWORD PTR [rsp+128], 4 + mov rsi, QWORD PTR [ptr_L_avx1_sha512_k] + mov rbx, r9 + mov rax, r12 + xor rbx, r10 + ; Start of 16 rounds +L_transform_sha512_avx1_start: + vpaddq xmm8, xmm0, [rsi] + vpaddq xmm9, xmm1, [rsi+16] + vmovdqu OWORD PTR [rsp], xmm8 + vmovdqu OWORD PTR [rsp+16], xmm9 + vpaddq xmm8, xmm2, [rsi+32] + vpaddq xmm9, xmm3, [rsi+48] + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vpaddq xmm8, xmm4, [rsi+64] + vpaddq xmm9, xmm5, [rsi+80] + vmovdqu OWORD PTR [rsp+64], xmm8 + vmovdqu OWORD PTR [rsp+80], xmm9 + vpaddq xmm8, xmm6, [rsi+96] + vpaddq xmm9, xmm7, [rsi+112] + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + add rsi, 128 + ; msg_sched: 0-1 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm1, xmm0, 8 + vpalignr xmm13, xmm5, xmm4, 8 + ; rnd_0: 1 - 1 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rsp] + xor rcx, r14 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm0, xmm13, xmm0 + ; rnd_0: 10 - 11 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm0, xmm8, xmm0 + ; rnd_1: 1 - 1 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rsp+8] + xor rcx, r13 + vpsrlq xmm8, xmm7, 19 + vpsllq xmm9, xmm7, 45 + ; rnd_1: 2 - 3 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + vpsrlq xmm10, xmm7, 61 + vpsllq xmm11, xmm7, 3 + ; rnd_1: 4 - 6 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + 
add r14, rax + mov rcx, r15 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm7, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r10 + add r14, rcx + vpaddq xmm0, xmm8, xmm0 + ; msg_sched done: 0-1 + ; msg_sched: 2-3 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm2, xmm1, 8 + vpalignr xmm13, xmm6, xmm5, 8 + ; rnd_0: 1 - 1 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rsp+16] + xor rcx, r12 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm1, xmm13, xmm1 + ; rnd_0: 10 - 11 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm1, xmm8, xmm1 + ; rnd_1: 1 - 1 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rsp+24] + xor rcx, r11 + vpsrlq xmm8, xmm0, 19 + vpsllq xmm9, xmm0, 45 + ; rnd_1: 2 - 3 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + vpsrlq xmm10, xmm0, 61 + vpsllq xmm11, xmm0, 3 + ; rnd_1: 4 - 6 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm0, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror 
rcx, 28 + mov rax, r8 + add r12, rcx + vpaddq xmm1, xmm8, xmm1 + ; msg_sched done: 2-3 + ; msg_sched: 4-5 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm3, xmm2, 8 + vpalignr xmm13, xmm7, xmm6, 8 + ; rnd_0: 1 - 1 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rsp+32] + xor rcx, r10 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm2, xmm13, xmm2 + ; rnd_0: 10 - 11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm2, xmm8, xmm2 + ; rnd_1: 1 - 1 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rsp+40] + xor rcx, r9 + vpsrlq xmm8, xmm1, 19 + vpsllq xmm9, xmm1, 45 + ; rnd_1: 2 - 3 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + vpsrlq xmm10, xmm1, 61 + vpsllq xmm11, xmm1, 3 + ; rnd_1: 4 - 6 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm1, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r14 + add r10, rcx + vpaddq xmm2, xmm8, xmm2 + ; msg_sched done: 4-5 + ; msg_sched: 6-7 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm4, xmm3, 8 + vpalignr xmm13, xmm0, xmm7, 8 + ; rnd_0: 1 - 1 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rsp+48] + xor rcx, r8 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, 
xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm3, xmm13, xmm3 + ; rnd_0: 10 - 11 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm3, xmm8, xmm3 + ; rnd_1: 1 - 1 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rsp+56] + xor rcx, r15 + vpsrlq xmm8, xmm2, 19 + vpsllq xmm9, xmm2, 45 + ; rnd_1: 2 - 3 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + vpsrlq xmm10, xmm2, 61 + vpsllq xmm11, xmm2, 3 + ; rnd_1: 4 - 6 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm2, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r12 + add r8, rcx + vpaddq xmm3, xmm8, xmm3 + ; msg_sched done: 6-7 + ; msg_sched: 8-9 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm5, xmm4, 8 + vpalignr xmm13, xmm1, xmm0, 8 + ; rnd_0: 1 - 1 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rsp+64] + xor rcx, r14 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + 
vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm4, xmm13, xmm4 + ; rnd_0: 10 - 11 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm4, xmm8, xmm4 + ; rnd_1: 1 - 1 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rsp+72] + xor rcx, r13 + vpsrlq xmm8, xmm3, 19 + vpsllq xmm9, xmm3, 45 + ; rnd_1: 2 - 3 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + vpsrlq xmm10, xmm3, 61 + vpsllq xmm11, xmm3, 3 + ; rnd_1: 4 - 6 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm3, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r10 + add r14, rcx + vpaddq xmm4, xmm8, xmm4 + ; msg_sched done: 8-9 + ; msg_sched: 10-11 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm6, xmm5, 8 + vpalignr xmm13, xmm2, xmm1, 8 + ; rnd_0: 1 - 1 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rsp+80] + xor rcx, r12 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm5, xmm13, xmm5 + ; rnd_0: 10 - 11 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm5, xmm8, xmm5 + ; 
rnd_1: 1 - 1 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rsp+88] + xor rcx, r11 + vpsrlq xmm8, xmm4, 19 + vpsllq xmm9, xmm4, 45 + ; rnd_1: 2 - 3 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + vpsrlq xmm10, xmm4, 61 + vpsllq xmm11, xmm4, 3 + ; rnd_1: 4 - 6 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm4, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r8 + add r12, rcx + vpaddq xmm5, xmm8, xmm5 + ; msg_sched done: 10-11 + ; msg_sched: 12-13 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm7, xmm6, 8 + vpalignr xmm13, xmm3, xmm2, 8 + ; rnd_0: 1 - 1 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rsp+96] + xor rcx, r10 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm6, xmm13, xmm6 + ; rnd_0: 10 - 11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm6, xmm8, xmm6 + ; rnd_1: 1 - 1 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rsp+104] + xor rcx, r9 + vpsrlq xmm8, xmm5, 19 + vpsllq xmm9, xmm5, 45 + ; rnd_1: 2 - 3 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + vpsrlq xmm10, xmm5, 61 + vpsllq xmm11, xmm5, 3 + ; rnd_1: 4 - 6 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, 
r12 + add r10, rax + mov rcx, r11 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm5, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r14 + add r10, rcx + vpaddq xmm6, xmm8, xmm6 + ; msg_sched done: 12-13 + ; msg_sched: 14-15 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm0, xmm7, 8 + vpalignr xmm13, xmm4, xmm3, 8 + ; rnd_0: 1 - 1 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rsp+112] + xor rcx, r8 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm7, xmm13, xmm7 + ; rnd_0: 10 - 11 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm7, xmm8, xmm7 + ; rnd_1: 1 - 1 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rsp+120] + xor rcx, r15 + vpsrlq xmm8, xmm6, 19 + vpsllq xmm9, xmm6, 45 + ; rnd_1: 2 - 3 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + vpsrlq xmm10, xmm6, 61 + vpsllq xmm11, xmm6, 3 + ; rnd_1: 4 - 6 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm6, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror 
rcx, 28 + mov rax, r12 + add r8, rcx + vpaddq xmm7, xmm8, xmm7 + ; msg_sched done: 14-15 + sub DWORD PTR [rsp+128], 1 + jne L_transform_sha512_avx1_start + vpaddq xmm8, xmm0, [rsi] + vpaddq xmm9, xmm1, [rsi+16] + vmovdqu OWORD PTR [rsp], xmm8 + vmovdqu OWORD PTR [rsp+16], xmm9 + vpaddq xmm8, xmm2, [rsi+32] + vpaddq xmm9, xmm3, [rsi+48] + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vpaddq xmm8, xmm4, [rsi+64] + vpaddq xmm9, xmm5, [rsi+80] + vmovdqu OWORD PTR [rsp+64], xmm8 + vmovdqu OWORD PTR [rsp+80], xmm9 + vpaddq xmm8, xmm6, [rsi+96] + vpaddq xmm9, xmm7, [rsi+112] + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + ; rnd_all_2: 0-1 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rsp] + xor rcx, r14 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rsp+8] + xor rcx, r13 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + ror rcx, 28 + mov rax, r10 + add r14, rcx + ; rnd_all_2: 2-3 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rsp+16] + xor rcx, r12 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r13 + mov rcx, r10 + 
add r12, QWORD PTR [rsp+24] + xor rcx, r11 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + ror rcx, 28 + mov rax, r8 + add r12, rcx + ; rnd_all_2: 4-5 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rsp+32] + xor rcx, r10 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rsp+40] + xor rcx, r9 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + ror rcx, 28 + mov rax, r14 + add r10, rcx + ; rnd_all_2: 6-7 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rsp+48] + xor rcx, r8 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rsp+56] + xor rcx, r15 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + ror rcx, 28 + mov rax, r12 
+ add r8, rcx + ; rnd_all_2: 8-9 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rsp+64] + xor rcx, r14 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rsp+72] + xor rcx, r13 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + ror rcx, 28 + mov rax, r10 + add r14, rcx + ; rnd_all_2: 10-11 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rsp+80] + xor rcx, r12 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rsp+88] + xor rcx, r11 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + ror rcx, 28 + mov rax, r8 + add r12, rcx + ; rnd_all_2: 12-13 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rsp+96] + xor rcx, r10 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + xor 
rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rsp+104] + xor rcx, r9 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + ror rcx, 28 + mov rax, r14 + add r10, rcx + ; rnd_all_2: 14-15 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rsp+112] + xor rcx, r8 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rsp+120] + xor rcx, r15 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + ror rcx, 28 + mov rax, r12 + add r8, rcx + add QWORD PTR [rdi], r8 + add QWORD PTR [rdi+8], r9 + add QWORD PTR [rdi+16], r10 + add QWORD PTR [rdi+24], r11 + add QWORD PTR [rdi+32], r12 + add QWORD PTR [rdi+40], r13 + add QWORD PTR [rdi+48], r14 + add QWORD PTR [rdi+56], r15 + xor rax, rax + vmovdqu xmm6, OWORD PTR [rsp+136] + vmovdqu xmm7, OWORD PTR [rsp+152] + vmovdqu xmm8, OWORD PTR [rsp+168] + vmovdqu xmm9, OWORD PTR [rsp+184] + vmovdqu xmm10, OWORD PTR [rsp+200] + vmovdqu xmm11, OWORD PTR [rsp+216] + vmovdqu xmm13, OWORD PTR [rsp+232] + vmovdqu xmm12, OWORD PTR [rsp+248] + vmovdqu xmm14, OWORD PTR [rsp+264] + add rsp, 280 + pop rsi + pop rdi + pop r15 + pop 
r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha512_AVX1 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha512_AVX1_Len PROC + ; Transform_Sha512_AVX1_Len: process consecutive 128-byte input blocks and + ; update the eight 64-bit state words held at [rcx]..[rcx+56]. + ; rcx = context pointer (kept in rdi); the input pointer is read from [rcx+224]. + ; edx = byte count (kept in ebp); the block loop exits only when the count + ; reaches exactly zero, so it must be a non-zero multiple of 128. + ; Returns 0 in rax. + ; Stack frame (288 bytes): [rsp..rsp+127] holds W[t]+K[t] for 16 rounds, + ; [rsp+128] the pass counter, [rsp+136] the spilled K pointer and + ; [rsp+144..rsp+287] the saved xmm6-xmm14. + ; NOTE(review): the saved set (rbx, rbp, rdi, rsi, r12-r15, xmm6-xmm14) matches + ; the non-volatile registers of the Microsoft x64 convention - confirm. + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbp + mov rdi, rcx + mov rbp, rdx + sub rsp, 288 + vmovdqu OWORD PTR [rsp+144], xmm6 + vmovdqu OWORD PTR [rsp+160], xmm7 + vmovdqu OWORD PTR [rsp+176], xmm8 + vmovdqu OWORD PTR [rsp+192], xmm9 + vmovdqu OWORD PTR [rsp+208], xmm10 + vmovdqu OWORD PTR [rsp+224], xmm11 + vmovdqu OWORD PTR [rsp+240], xmm13 + vmovdqu OWORD PTR [rsp+256], xmm12 + vmovdqu OWORD PTR [rsp+272], xmm14 + ; rsi = input pointer, rdx = K table pointer, xmm14 = vpshufb mask + mov rsi, QWORD PTR [rdi+224] + mov rdx, QWORD PTR [ptr_L_avx1_sha512_k] + vmovdqa xmm14, OWORD PTR L_avx1_sha512_flip_mask + ; Load the eight 64-bit state words into r8-r15 + mov r8, QWORD PTR [rdi] + mov r9, QWORD PTR [rdi+8] + mov r10, QWORD PTR [rdi+16] + mov r11, QWORD PTR [rdi+24] + mov r12, QWORD PTR [rdi+32] + mov r13, QWORD PTR [rdi+40] + mov r14, QWORD PTR [rdi+48] + mov r15, QWORD PTR [rdi+56] + ; Start of loop processing a block +L_sha512_len_avx1_begin: + vmovdqu xmm0, OWORD PTR [rsi] + vmovdqu xmm1, OWORD PTR [rsi+16] + vpshufb xmm0, xmm0, xmm14 + vpshufb xmm1, xmm1, xmm14 + vmovdqu xmm2, OWORD PTR [rsi+32] + vmovdqu xmm3, OWORD PTR [rsi+48] + vpshufb xmm2, xmm2, xmm14 + vpshufb xmm3, xmm3, xmm14 + vmovdqu xmm4, OWORD PTR [rsi+64] + vmovdqu xmm5, OWORD PTR [rsi+80] + vpshufb xmm4, xmm4, xmm14 + vpshufb xmm5, xmm5, xmm14 + vmovdqu xmm6, OWORD PTR [rsi+96] + vmovdqu xmm7, OWORD PTR [rsi+112] + vpshufb xmm6, xmm6, xmm14 + vpshufb xmm7, xmm7, xmm14 + ; Pass counter: 4 passes of 16 rounds that also run the message schedule + mov DWORD PTR [rsp+128], 4 + mov rbx, r9 + mov rax, r12 + xor rbx, r10 + ; Stage W[0..15]+K[0..15] on the stack for the scalar rounds to consume + vpaddq xmm8, xmm0, [rdx] + vpaddq xmm9, xmm1, [rdx+16] + vmovdqu OWORD PTR [rsp], xmm8 + vmovdqu OWORD PTR [rsp+16], xmm9 + vpaddq xmm8, xmm2, [rdx+32] + vpaddq xmm9, xmm3, [rdx+48] + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vpaddq xmm8, xmm4, [rdx+64] + vpaddq xmm9, xmm5, [rdx+80] + vmovdqu OWORD PTR [rsp+64], xmm8 + vmovdqu OWORD PTR [rsp+80], xmm9 + vpaddq xmm8,
xmm6, [rdx+96] + vpaddq xmm9, xmm7, [rdx+112] + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + ; Start of 16 rounds +L_sha512_len_avx1_start: + ; Step the K pointer past the 16 constants already consumed and spill it, + ; because rdx is reused as a scratch register by the rounds below. + add rdx, 128 + mov QWORD PTR [rsp+136], rdx + ; msg_sched: 0-1 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm1, xmm0, 8 + vpalignr xmm13, xmm5, xmm4, 8 + ; rnd_0: 1 - 1 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rsp] + xor rcx, r14 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm0, xmm13, xmm0 + ; rnd_0: 10 - 11 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm0, xmm8, xmm0 + ; rnd_1: 1 - 1 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rsp+8] + xor rcx, r13 + vpsrlq xmm8, xmm7, 19 + vpsllq xmm9, xmm7, 45 + ; rnd_1: 2 - 3 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + vpsrlq xmm10, xmm7, 61 + vpsllq xmm11, xmm7, 3 + ; rnd_1: 4 - 6 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm7, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r10 + add r14, rcx + vpaddq xmm0, xmm8, xmm0 + ; msg_sched done: 0-1 + ; msg_sched: 2-3 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm2, xmm1, 8 + vpalignr xmm13, xmm6, xmm5, 8 + ; rnd_0: 1
- 1 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rsp+16] + xor rcx, r12 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm1, xmm13, xmm1 + ; rnd_0: 10 - 11 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm1, xmm8, xmm1 + ; rnd_1: 1 - 1 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rsp+24] + xor rcx, r11 + vpsrlq xmm8, xmm0, 19 + vpsllq xmm9, xmm0, 45 + ; rnd_1: 2 - 3 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + vpsrlq xmm10, xmm0, 61 + vpsllq xmm11, xmm0, 3 + ; rnd_1: 4 - 6 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm0, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r8 + add r12, rcx + vpaddq xmm1, xmm8, xmm1 + ; msg_sched done: 2-3 + ; msg_sched: 4-5 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm3, xmm2, 8 + vpalignr xmm13, xmm7, xmm6, 8 + ; rnd_0: 1 - 1 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rsp+32] + xor rcx, r10 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + vpor
xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm2, xmm13, xmm2 + ; rnd_0: 10 - 11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm2, xmm8, xmm2 + ; rnd_1: 1 - 1 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rsp+40] + xor rcx, r9 + vpsrlq xmm8, xmm1, 19 + vpsllq xmm9, xmm1, 45 + ; rnd_1: 2 - 3 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + vpsrlq xmm10, xmm1, 61 + vpsllq xmm11, xmm1, 3 + ; rnd_1: 4 - 6 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm1, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r14 + add r10, rcx + vpaddq xmm2, xmm8, xmm2 + ; msg_sched done: 4-5 + ; msg_sched: 6-7 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm4, xmm3, 8 + vpalignr xmm13, xmm0, xmm7, 8 + ; rnd_0: 1 - 1 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rsp+48] + xor rcx, r8 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm3, xmm13, xmm3 + ; rnd_0: 10 - 11 + xor rcx, r10 +
add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm3, xmm8, xmm3 + ; rnd_1: 1 - 1 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rsp+56] + xor rcx, r15 + vpsrlq xmm8, xmm2, 19 + vpsllq xmm9, xmm2, 45 + ; rnd_1: 2 - 3 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + vpsrlq xmm10, xmm2, 61 + vpsllq xmm11, xmm2, 3 + ; rnd_1: 4 - 6 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm2, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r12 + add r8, rcx + vpaddq xmm3, xmm8, xmm3 + ; msg_sched done: 6-7 + ; msg_sched: 8-9 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm5, xmm4, 8 + vpalignr xmm13, xmm1, xmm0, 8 + ; rnd_0: 1 - 1 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rsp+64] + xor rcx, r14 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm4, xmm13, xmm4 + ; rnd_0: 10 - 11 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm4, xmm8, xmm4 + ; rnd_1: 1 - 1 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rsp+72] + xor rcx, r13 + vpsrlq xmm8, xmm3, 19 + vpsllq xmm9, xmm3, 45 + ; rnd_1: 2 - 3 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + vpsrlq xmm10,
xmm3, 61 + vpsllq xmm11, xmm3, 3 + ; rnd_1: 4 - 6 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm3, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r10 + add r14, rcx + vpaddq xmm4, xmm8, xmm4 + ; msg_sched done: 8-9 + ; msg_sched: 10-11 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm6, xmm5, 8 + vpalignr xmm13, xmm2, xmm1, 8 + ; rnd_0: 1 - 1 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rsp+80] + xor rcx, r12 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm5, xmm13, xmm5 + ; rnd_0: 10 - 11 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm5, xmm8, xmm5 + ; rnd_1: 1 - 1 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rsp+88] + xor rcx, r11 + vpsrlq xmm8, xmm4, 19 + vpsllq xmm9, xmm4, 45 + ; rnd_1: 2 - 3 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + vpsrlq xmm10, xmm4, 61 + vpsllq xmm11, xmm4, 3 + ; rnd_1: 4 - 6 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm4, 6 + ; rnd_1: 9 -
10 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r8 + add r12, rcx + vpaddq xmm5, xmm8, xmm5 + ; msg_sched done: 10-11 + ; msg_sched: 12-13 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm7, xmm6, 8 + vpalignr xmm13, xmm3, xmm2, 8 + ; rnd_0: 1 - 1 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rsp+96] + xor rcx, r10 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm6, xmm13, xmm6 + ; rnd_0: 10 - 11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm6, xmm8, xmm6 + ; rnd_1: 1 - 1 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rsp+104] + xor rcx, r9 + vpsrlq xmm8, xmm5, 19 + vpsllq xmm9, xmm5, 45 + ; rnd_1: 2 - 3 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + vpsrlq xmm10, xmm5, 61 + vpsllq xmm11, xmm5, 3 + ; rnd_1: 4 - 6 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm5, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r14 + add r10, rcx + vpaddq xmm6, xmm8, xmm6 + ; msg_sched done: 12-13 + ; msg_sched: 14-15 + ; rnd_0: 0 - 0 + ror rax, 23 + vpalignr xmm12, xmm0, xmm7, 8 + vpalignr xmm13, xmm4, xmm3, 8 + ; rnd_0: 1
- 1 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rsp+112] + xor rcx, r8 + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 3 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 4 - 5 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 6 - 7 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 8 - 9 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + vpxor xmm8, xmm8, xmm11 + vpaddq xmm7, xmm13, xmm7 + ; rnd_0: 10 - 11 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ; rnd_1: 0 - 0 + ror rax, 23 + vpaddq xmm7, xmm8, xmm7 + ; rnd_1: 1 - 1 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rsp+120] + xor rcx, r15 + vpsrlq xmm8, xmm6, 19 + vpsllq xmm9, xmm6, 45 + ; rnd_1: 2 - 3 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + vpsrlq xmm10, xmm6, 61 + vpsllq xmm11, xmm6, 3 + ; rnd_1: 4 - 6 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 7 - 8 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm6, 6 + ; rnd_1: 9 - 10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 11 - 11 + ror rcx, 28 + mov rax, r12 + add r8, rcx + vpaddq xmm7, xmm8, xmm7 + ; msg_sched done: 14-15 + ; Reload the spilled K pointer and stage W+K for the next 16 rounds + mov rdx, QWORD PTR [rsp+136] + vpaddq xmm8, xmm0, [rdx] + vpaddq xmm9, xmm1, [rdx+16] + vmovdqu OWORD PTR [rsp], xmm8 + vmovdqu OWORD PTR [rsp+16], xmm9 + vpaddq xmm8, xmm2, [rdx+32] + vpaddq xmm9, xmm3, [rdx+48] + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vpaddq xmm8, xmm4, [rdx+64] + vpaddq xmm9, xmm5, [rdx+80] + vmovdqu OWORD PTR [rsp+64], xmm8 + vmovdqu OWORD PTR [rsp+80], xmm9 + vpaddq xmm8, xmm6, [rdx+96] + vpaddq xmm9, xmm7,
[rdx+112] + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + sub DWORD PTR [rsp+128], 1 + jne L_sha512_len_avx1_start + ; Rounds 64-79: the last 16 rounds run without any message scheduling + ; rnd_all_2: 0-1 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rsp] + xor rcx, r14 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rsp+8] + xor rcx, r13 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + ror rcx, 28 + mov rax, r10 + add r14, rcx + ; rnd_all_2: 2-3 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rsp+16] + xor rcx, r12 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rsp+24] + xor rcx, r11 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + ror rcx, 28 + mov rax, r8 + add r12, rcx + ; rnd_all_2: 4-5 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rsp+32] + xor rcx, r10 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 +
xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rsp+40] + xor rcx, r9 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + ror rcx, 28 + mov rax, r14 + add r10, rcx + ; rnd_all_2: 6-7 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rsp+48] + xor rcx, r8 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rsp+56] + xor rcx, r15 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + ror rcx, 28 + mov rax, r12 + add r8, rcx + ; rnd_all_2: 8-9 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rsp+64] + xor rcx, r14 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR
[rsp+72] + xor rcx, r13 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + ror rcx, 28 + mov rax, r10 + add r14, rcx + ; rnd_all_2: 10-11 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rsp+80] + xor rcx, r12 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rsp+88] + xor rcx, r11 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + ror rcx, 28 + mov rax, r8 + add r12, rcx + ; rnd_all_2: 12-13 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rsp+96] + xor rcx, r10 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rsp+104] + xor rcx, r9 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + ror rcx, 28 + mov rax, r14
+ add r10, rcx + ; rnd_all_2: 14-15 + ; rnd_0: 0 - 11 + ror rax, 23 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rsp+112] + xor rcx, r8 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ; rnd_1: 0 - 11 + ror rax, 23 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rsp+120] + xor rcx, r15 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + ror rcx, 28 + mov rax, r12 + add r8, rcx + ; Add the previous state words into r8-r15; they are stored back below + add r8, QWORD PTR [rdi] + add r9, QWORD PTR [rdi+8] + add r10, QWORD PTR [rdi+16] + add r11, QWORD PTR [rdi+24] + add r12, QWORD PTR [rdi+32] + add r13, QWORD PTR [rdi+40] + add r14, QWORD PTR [rdi+48] + add r15, QWORD PTR [rdi+56] + ; Rewind rdx to the start of the K table, step the input pointer to the + ; next 128-byte block and count down the remaining bytes + mov rdx, QWORD PTR [ptr_L_avx1_sha512_k] + add rsi, 128 + sub ebp, 128 + mov QWORD PTR [rdi], r8 + mov QWORD PTR [rdi+8], r9 + mov QWORD PTR [rdi+16], r10 + mov QWORD PTR [rdi+24], r11 + mov QWORD PTR [rdi+32], r12 + mov QWORD PTR [rdi+40], r13 + mov QWORD PTR [rdi+48], r14 + mov QWORD PTR [rdi+56], r15 + ; mov does not modify flags, so jnz still tests the result of sub ebp, 128 + jnz L_sha512_len_avx1_begin + ; Return 0, then restore the saved xmm and general-purpose registers + xor rax, rax + vmovdqu xmm6, OWORD PTR [rsp+144] + vmovdqu xmm7, OWORD PTR [rsp+160] + vmovdqu xmm8, OWORD PTR [rsp+176] + vmovdqu xmm9, OWORD PTR [rsp+192] + vmovdqu xmm10, OWORD PTR [rsp+208] + vmovdqu xmm11, OWORD PTR [rsp+224] + vmovdqu xmm13, OWORD PTR [rsp+240] + vmovdqu xmm12, OWORD PTR [rsp+256] + vmovdqu xmm14, OWORD PTR [rsp+272] + add rsp, 288 + pop rbp + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha512_AVX1_Len ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_rorx_sha512_k QWORD 428a2f98d728ae22h, 7137449123ef65cdh +
QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch + QWORD 3956c25bf348b538h, 59f111f1b605d019h + QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h + QWORD 0d807aa98a3030242h, 12835b0145706fbeh + QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h + QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h + QWORD 9bdc06a725c71235h, 0c19bf174cf692694h + QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h + QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h + QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h + QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h + QWORD 983e5152ee66dfabh, 0a831c66d2db43210h + QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h + QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h + QWORD 06ca6351e003826fh, 142929670a0e6e70h + QWORD 27b70a8546d22ffch, 2e1b21385c26c926h + QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh + QWORD 650a73548baf63deh, 766a0abb3c77b2a8h + QWORD 81c2c92e47edaee6h, 92722c851482353bh + QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h + QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h + QWORD 0d192e819d6ef5218h, 0d69906245565a910h + QWORD 0f40e35855771202ah, 106aa07032bbd1b8h + QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h + QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h + QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh + QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h + QWORD 748f82ee5defb2fch, 78a5636f43172f60h + QWORD 84c87814a1f0ab72h, 8cc702081a6439ech + QWORD 90befffa23631e28h, 0a4506cebde82bde9h + QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh + QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h + QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h + QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h + QWORD 113f9804bef90daeh, 1b710b35131c471bh + QWORD 28db77f523047d84h, 32caab7b40c72493h + QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch + QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah + QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h + ; End of L_avx1_rorx_sha512_k: 80 QWORD round constants (40 rows of two). + ; The QWORD below stores the table's address; Transform_Sha512_AVX1_RORX + ; loads it into rsi with mov rsi, QWORD PTR [ptr_L_avx1_rorx_sha512_k]. +ptr_L_avx1_rorx_sha512_k QWORD L_avx1_rorx_sha512_k +_DATA ENDS +_DATA SEGMENT +ALIGN 16 + ; vpshufb control mask that reverses the byte order inside each 64-bit lane. + ; Kept 16-byte aligned because it is loaded with vmovdqa. +L_avx1_rorx_sha512_flip_mask QWORD 0001020304050607h, 08090a0b0c0d0e0fh +ptr_L_avx1_rorx_sha512_flip_mask QWORD
L_avx1_rorx_sha512_flip_mask +_DATA ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha512_AVX1_RORX PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + mov rdi, rcx + sub rsp, 280 + vmovdqu OWORD PTR [rsp+136], xmm6 + vmovdqu OWORD PTR [rsp+152], xmm7 + vmovdqu OWORD PTR [rsp+168], xmm8 + vmovdqu OWORD PTR [rsp+184], xmm9 + vmovdqu OWORD PTR [rsp+200], xmm10 + vmovdqu OWORD PTR [rsp+216], xmm11 + vmovdqu OWORD PTR [rsp+232], xmm13 + vmovdqu OWORD PTR [rsp+248], xmm12 + vmovdqu OWORD PTR [rsp+264], xmm14 + lea rax, QWORD PTR [rdi+64] + vmovdqa xmm14, OWORD PTR L_avx1_rorx_sha512_flip_mask + mov r8, QWORD PTR [rdi] + mov r9, QWORD PTR [rdi+8] + mov r10, QWORD PTR [rdi+16] + mov r11, QWORD PTR [rdi+24] + mov r12, QWORD PTR [rdi+32] + mov r13, QWORD PTR [rdi+40] + mov r14, QWORD PTR [rdi+48] + mov r15, QWORD PTR [rdi+56] + vmovdqu xmm0, OWORD PTR [rax] + vmovdqu xmm1, OWORD PTR [rax+16] + vpshufb xmm0, xmm0, xmm14 + vpshufb xmm1, xmm1, xmm14 + vmovdqu xmm2, OWORD PTR [rax+32] + vmovdqu xmm3, OWORD PTR [rax+48] + vpshufb xmm2, xmm2, xmm14 + vpshufb xmm3, xmm3, xmm14 + vmovdqu xmm4, OWORD PTR [rax+64] + vmovdqu xmm5, OWORD PTR [rax+80] + vpshufb xmm4, xmm4, xmm14 + vpshufb xmm5, xmm5, xmm14 + vmovdqu xmm6, OWORD PTR [rax+96] + vmovdqu xmm7, OWORD PTR [rax+112] + vpshufb xmm6, xmm6, xmm14 + vpshufb xmm7, xmm7, xmm14 + mov DWORD PTR [rsp+128], 4 + mov rsi, QWORD PTR [ptr_L_avx1_rorx_sha512_k] + mov rbx, r9 + xor rdx, rdx + xor rbx, r10 + vpaddq xmm8, xmm0, [rsi] + vpaddq xmm9, xmm1, [rsi+16] + vmovdqu OWORD PTR [rsp], xmm8 + vmovdqu OWORD PTR [rsp+16], xmm9 + vpaddq xmm8, xmm2, [rsi+32] + vpaddq xmm9, xmm3, [rsi+48] + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vpaddq xmm8, xmm4, [rsi+64] + vpaddq xmm9, xmm5, [rsi+80] + vmovdqu OWORD PTR [rsp+64], xmm8 + vmovdqu OWORD PTR [rsp+80], xmm9 + vpaddq xmm8, xmm6, [rsi+96] + vpaddq xmm9, xmm7, [rsi+112] + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + 
; Start of 16 rounds +L_transform_sha512_avx1_rorx_start: + add rsi, 128 + ; msg_sched: 0-1 + ; rnd_0: 0 - 0 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + vpalignr xmm12, xmm1, xmm0, 8 + vpalignr xmm13, xmm5, xmm4, 8 + ; rnd_0: 1 - 1 + add r15, QWORD PTR [rsp] + mov rdx, r13 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm0, xmm13, xmm0 + ; rnd_0: 6 - 7 + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + vpaddq xmm0, xmm8, xmm0 + ; rnd_1: 0 - 0 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + vpsrlq xmm8, xmm7, 19 + vpsllq xmm9, xmm7, 45 + ; rnd_1: 1 - 1 + add r14, QWORD PTR [rsp+8] + mov rbx, r12 + xor rcx, rax + vpsrlq xmm10, xmm7, 61 + vpsllq xmm11, xmm7, 3 + ; rnd_1: 2 - 2 + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm7, 6 + ; rnd_1: 5 - 6 + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + add r10, r14 + xor rbx, r15 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r14, rax + xor rdx, r8 + vpaddq xmm0, xmm8, xmm0 + ; msg_sched done: 0-1 + ; msg_sched: 2-3 + ; rnd_0: 0 - 0 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + vpalignr xmm12, xmm2, xmm1, 8 + vpalignr xmm13, xmm6, xmm5, 8 + ; rnd_0: 1 - 1 + add r13, QWORD PTR [rsp+16] + mov rdx, r11 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, 
xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm1, xmm13, xmm1 + ; rnd_0: 6 - 7 + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + vpaddq xmm1, xmm8, xmm1 + ; rnd_1: 0 - 0 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + vpsrlq xmm8, xmm0, 19 + vpsllq xmm9, xmm0, 45 + ; rnd_1: 1 - 1 + add r12, QWORD PTR [rsp+24] + mov rbx, r10 + xor rcx, rax + vpsrlq xmm10, xmm0, 61 + vpsllq xmm11, xmm0, 3 + ; rnd_1: 2 - 2 + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm0, 6 + ; rnd_1: 5 - 6 + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + add r8, r12 + xor rbx, r13 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r12, rax + xor rdx, r14 + vpaddq xmm1, xmm8, xmm1 + ; msg_sched done: 2-3 + ; msg_sched: 4-5 + ; rnd_0: 0 - 0 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + vpalignr xmm12, xmm3, xmm2, 8 + vpalignr xmm13, xmm7, xmm6, 8 + ; rnd_0: 1 - 1 + add r11, QWORD PTR [rsp+32] + mov rdx, r9 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + 
vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm2, xmm13, xmm2 + ; rnd_0: 6 - 7 + mov rdx, r13 + add r15, r11 + xor rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + vpaddq xmm2, xmm8, xmm2 + ; rnd_1: 0 - 0 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + vpsrlq xmm8, xmm1, 19 + vpsllq xmm9, xmm1, 45 + ; rnd_1: 1 - 1 + add r10, QWORD PTR [rsp+40] + mov rbx, r8 + xor rcx, rax + vpsrlq xmm10, xmm1, 61 + vpsllq xmm11, xmm1, 3 + ; rnd_1: 2 - 2 + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm1, 6 + ; rnd_1: 5 - 6 + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + add r14, r10 + xor rbx, r11 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r10, rax + xor rdx, r12 + vpaddq xmm2, xmm8, xmm2 + ; msg_sched done: 4-5 + ; msg_sched: 6-7 + ; rnd_0: 0 - 0 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + vpalignr xmm12, xmm4, xmm3, 8 + vpalignr xmm13, xmm0, xmm7, 8 + ; rnd_0: 1 - 1 + add r9, QWORD PTR [rsp+48] + mov rdx, r15 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm3, xmm13, xmm3 + ; rnd_0: 6 - 7 + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, r11 + vpaddq xmm3, xmm8, xmm3 + ; rnd_1: 0 - 0 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add 
r9, rbx + vpsrlq xmm8, xmm2, 19 + vpsllq xmm9, xmm2, 45 + ; rnd_1: 1 - 1 + add r8, QWORD PTR [rsp+56] + mov rbx, r14 + xor rcx, rax + vpsrlq xmm10, xmm2, 61 + vpsllq xmm11, xmm2, 3 + ; rnd_1: 2 - 2 + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm2, 6 + ; rnd_1: 5 - 6 + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + add r12, r8 + xor rbx, r9 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r8, rax + xor rdx, r10 + vpaddq xmm3, xmm8, xmm3 + ; msg_sched done: 6-7 + ; msg_sched: 8-9 + ; rnd_0: 0 - 0 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + vpalignr xmm12, xmm5, xmm4, 8 + vpalignr xmm13, xmm1, xmm0, 8 + ; rnd_0: 1 - 1 + add r15, QWORD PTR [rsp+64] + mov rdx, r13 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm4, xmm13, xmm4 + ; rnd_0: 6 - 7 + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + vpaddq xmm4, xmm8, xmm4 + ; rnd_1: 0 - 0 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + vpsrlq xmm8, xmm3, 19 + vpsllq xmm9, xmm3, 45 + ; rnd_1: 1 - 1 + add r14, QWORD PTR [rsp+72] + mov rbx, r12 + xor rcx, rax + vpsrlq xmm10, xmm3, 61 + vpsllq xmm11, xmm3, 3 + ; rnd_1: 2 - 2 + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r11 + add r14, rax 
+ rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm3, 6 + ; rnd_1: 5 - 6 + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + add r10, r14 + xor rbx, r15 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r14, rax + xor rdx, r8 + vpaddq xmm4, xmm8, xmm4 + ; msg_sched done: 8-9 + ; msg_sched: 10-11 + ; rnd_0: 0 - 0 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + vpalignr xmm12, xmm6, xmm5, 8 + vpalignr xmm13, xmm2, xmm1, 8 + ; rnd_0: 1 - 1 + add r13, QWORD PTR [rsp+80] + mov rdx, r11 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm5, xmm13, xmm5 + ; rnd_0: 6 - 7 + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + vpaddq xmm5, xmm8, xmm5 + ; rnd_1: 0 - 0 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + vpsrlq xmm8, xmm4, 19 + vpsllq xmm9, xmm4, 45 + ; rnd_1: 1 - 1 + add r12, QWORD PTR [rsp+88] + mov rbx, r10 + xor rcx, rax + vpsrlq xmm10, xmm4, 61 + vpsllq xmm11, xmm4, 3 + ; rnd_1: 2 - 2 + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm4, 6 + ; rnd_1: 5 - 6 + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + add r8, r12 + xor rbx, r13 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r12, rax + xor rdx, r14 + vpaddq xmm5, 
xmm8, xmm5 + ; msg_sched done: 10-11 + ; msg_sched: 12-13 + ; rnd_0: 0 - 0 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + vpalignr xmm12, xmm7, xmm6, 8 + vpalignr xmm13, xmm3, xmm2, 8 + ; rnd_0: 1 - 1 + add r11, QWORD PTR [rsp+96] + mov rdx, r9 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm6, xmm13, xmm6 + ; rnd_0: 6 - 7 + mov rdx, r13 + add r15, r11 + xor rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + vpaddq xmm6, xmm8, xmm6 + ; rnd_1: 0 - 0 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + vpsrlq xmm8, xmm5, 19 + vpsllq xmm9, xmm5, 45 + ; rnd_1: 1 - 1 + add r10, QWORD PTR [rsp+104] + mov rbx, r8 + xor rcx, rax + vpsrlq xmm10, xmm5, 61 + vpsllq xmm11, xmm5, 3 + ; rnd_1: 2 - 2 + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm5, 6 + ; rnd_1: 5 - 6 + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + add r14, r10 + xor rbx, r11 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r10, rax + xor rdx, r12 + vpaddq xmm6, xmm8, xmm6 + ; msg_sched done: 12-13 + ; msg_sched: 14-15 + ; rnd_0: 0 - 0 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + vpalignr xmm12, xmm0, xmm7, 8 + vpalignr xmm13, xmm4, xmm3, 8 + ; rnd_0: 1 - 1 + add r9, QWORD PTR [rsp+112] + mov rdx, r15 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 
- 2 + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm7, xmm13, xmm7 + ; rnd_0: 6 - 7 + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, r11 + vpaddq xmm7, xmm8, xmm7 + ; rnd_1: 0 - 0 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + vpsrlq xmm8, xmm6, 19 + vpsllq xmm9, xmm6, 45 + ; rnd_1: 1 - 1 + add r8, QWORD PTR [rsp+120] + mov rbx, r14 + xor rcx, rax + vpsrlq xmm10, xmm6, 61 + vpsllq xmm11, xmm6, 3 + ; rnd_1: 2 - 2 + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm6, 6 + ; rnd_1: 5 - 6 + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + add r12, r8 + xor rbx, r9 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r8, rax + xor rdx, r10 + vpaddq xmm7, xmm8, xmm7 + ; msg_sched done: 14-15 + vpaddq xmm8, xmm0, [rsi] + vpaddq xmm9, xmm1, [rsi+16] + vmovdqu OWORD PTR [rsp], xmm8 + vmovdqu OWORD PTR [rsp+16], xmm9 + vpaddq xmm8, xmm2, [rsi+32] + vpaddq xmm9, xmm3, [rsi+48] + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vpaddq xmm8, xmm4, [rsi+64] + vpaddq xmm9, xmm5, [rsi+80] + vmovdqu OWORD PTR [rsp+64], xmm8 + vmovdqu OWORD PTR [rsp+80], xmm9 + vpaddq xmm8, xmm6, [rsi+96] + vpaddq xmm9, xmm7, [rsi+112] + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + sub DWORD PTR [rsp+128], 1 + jne L_transform_sha512_avx1_rorx_start + ; rnd_all_2: 0-1 + ; rnd_0: 0 - 7 + rorx rax, 
r12, 14 + rorx rcx, r12, 18 + add r8, rdx + add r15, QWORD PTR [rsp] + mov rdx, r13 + xor rcx, rax + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + ; rnd_1: 0 - 7 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + add r14, QWORD PTR [rsp+8] + mov rbx, r12 + xor rcx, rax + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + add r10, r14 + xor rbx, r15 + and rdx, rbx + add r14, rax + xor rdx, r8 + ; rnd_all_2: 2-3 + ; rnd_0: 0 - 7 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + add r13, QWORD PTR [rsp+16] + mov rdx, r11 + xor rcx, rax + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + ; rnd_1: 0 - 7 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + add r12, QWORD PTR [rsp+24] + mov rbx, r10 + xor rcx, rax + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + add r8, r12 + xor rbx, r13 + and rdx, rbx + add r12, rax + xor rdx, r14 + ; rnd_all_2: 4-5 + ; rnd_0: 0 - 7 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + add r11, QWORD PTR [rsp+32] + mov rdx, r9 + xor rcx, rax + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + rorx rax, r12, 39 + add r11, rdx + xor 
rax, rcx + mov rdx, r13 + add r15, r11 + xor rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + ; rnd_1: 0 - 7 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + add r10, QWORD PTR [rsp+40] + mov rbx, r8 + xor rcx, rax + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + add r14, r10 + xor rbx, r11 + and rdx, rbx + add r10, rax + xor rdx, r12 + ; rnd_all_2: 6-7 + ; rnd_0: 0 - 7 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + add r9, QWORD PTR [rsp+48] + mov rdx, r15 + xor rcx, rax + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, r11 + ; rnd_1: 0 - 7 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + add r8, QWORD PTR [rsp+56] + mov rbx, r14 + xor rcx, rax + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + add r12, r8 + xor rbx, r9 + and rdx, rbx + add r8, rax + xor rdx, r10 + ; rnd_all_2: 8-9 + ; rnd_0: 0 - 7 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + add r15, QWORD PTR [rsp+64] + mov rdx, r13 + xor rcx, rax + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + ; rnd_1: 0 - 7 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + add r14, QWORD PTR [rsp+72] + mov rbx, r12 + xor rcx, rax + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + and rbx, r11 + 
add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + add r10, r14 + xor rbx, r15 + and rdx, rbx + add r14, rax + xor rdx, r8 + ; rnd_all_2: 10-11 + ; rnd_0: 0 - 7 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + add r13, QWORD PTR [rsp+80] + mov rdx, r11 + xor rcx, rax + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + ; rnd_1: 0 - 7 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + add r12, QWORD PTR [rsp+88] + mov rbx, r10 + xor rcx, rax + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + add r8, r12 + xor rbx, r13 + and rdx, rbx + add r12, rax + xor rdx, r14 + ; rnd_all_2: 12-13 + ; rnd_0: 0 - 7 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + add r11, QWORD PTR [rsp+96] + mov rdx, r9 + xor rcx, rax + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + mov rdx, r13 + add r15, r11 + xor rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + ; rnd_1: 0 - 7 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + add r10, QWORD PTR [rsp+104] + mov rbx, r8 + xor rcx, rax + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + add r14, r10 + xor rbx, r11 + and rdx, rbx + add r10, rax + xor rdx, r12 + ; rnd_all_2: 14-15 + ; rnd_0: 0 - 7 + rorx rax, r14, 14 + 
rorx rcx, r14, 18 + add r10, rdx + add r9, QWORD PTR [rsp+112] + mov rdx, r15 + xor rcx, rax + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, r11 + ; rnd_1: 0 - 7 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + add r8, QWORD PTR [rsp+120] + mov rbx, r14 + xor rcx, rax + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + add r12, r8 + xor rbx, r9 + and rdx, rbx + add r8, rax + xor rdx, r10 + add r8, rdx + add QWORD PTR [rdi], r8 + add QWORD PTR [rdi+8], r9 + add QWORD PTR [rdi+16], r10 + add QWORD PTR [rdi+24], r11 + add QWORD PTR [rdi+32], r12 + add QWORD PTR [rdi+40], r13 + add QWORD PTR [rdi+48], r14 + add QWORD PTR [rdi+56], r15 + xor rax, rax + vmovdqu xmm6, OWORD PTR [rsp+136] + vmovdqu xmm7, OWORD PTR [rsp+152] + vmovdqu xmm8, OWORD PTR [rsp+168] + vmovdqu xmm9, OWORD PTR [rsp+184] + vmovdqu xmm10, OWORD PTR [rsp+200] + vmovdqu xmm11, OWORD PTR [rsp+216] + vmovdqu xmm13, OWORD PTR [rsp+232] + vmovdqu xmm12, OWORD PTR [rsp+248] + vmovdqu xmm14, OWORD PTR [rsp+264] + add rsp, 280 + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha512_AVX1_RORX ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha512_AVX1_RORX_Len PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbp + mov rdi, rcx + mov rbp, rdx + sub rsp, 288 + vmovdqu OWORD PTR [rsp+144], xmm6 + vmovdqu OWORD PTR [rsp+160], xmm7 + vmovdqu OWORD PTR [rsp+176], xmm8 + vmovdqu OWORD PTR [rsp+192], xmm9 + vmovdqu OWORD PTR [rsp+208], xmm10 + vmovdqu OWORD PTR [rsp+224], xmm11 + vmovdqu OWORD PTR [rsp+240], xmm13 + vmovdqu OWORD PTR 
[rsp+256], xmm12 + vmovdqu OWORD PTR [rsp+272], xmm14 + mov rsi, QWORD PTR [rdi+224] + mov rcx, QWORD PTR [ptr_L_avx1_rorx_sha512_k] + vmovdqa xmm14, OWORD PTR L_avx1_rorx_sha512_flip_mask + mov r8, QWORD PTR [rdi] + mov r9, QWORD PTR [rdi+8] + mov r10, QWORD PTR [rdi+16] + mov r11, QWORD PTR [rdi+24] + mov r12, QWORD PTR [rdi+32] + mov r13, QWORD PTR [rdi+40] + mov r14, QWORD PTR [rdi+48] + mov r15, QWORD PTR [rdi+56] + ; Start of loop processing a block +L_sha512_len_avx1_rorx_begin: + vmovdqu xmm0, OWORD PTR [rsi] + vmovdqu xmm1, OWORD PTR [rsi+16] + vpshufb xmm0, xmm0, xmm14 + vpshufb xmm1, xmm1, xmm14 + vmovdqu xmm2, OWORD PTR [rsi+32] + vmovdqu xmm3, OWORD PTR [rsi+48] + vpshufb xmm2, xmm2, xmm14 + vpshufb xmm3, xmm3, xmm14 + vmovdqu xmm4, OWORD PTR [rsi+64] + vmovdqu xmm5, OWORD PTR [rsi+80] + vpshufb xmm4, xmm4, xmm14 + vpshufb xmm5, xmm5, xmm14 + vmovdqu xmm6, OWORD PTR [rsi+96] + vmovdqu xmm7, OWORD PTR [rsi+112] + vpshufb xmm6, xmm6, xmm14 + vpshufb xmm7, xmm7, xmm14 + mov DWORD PTR [rsp+128], 4 + mov rbx, r9 + xor rdx, rdx + xor rbx, r10 + vpaddq xmm8, xmm0, [rcx] + vpaddq xmm9, xmm1, [rcx+16] + vmovdqu OWORD PTR [rsp], xmm8 + vmovdqu OWORD PTR [rsp+16], xmm9 + vpaddq xmm8, xmm2, [rcx+32] + vpaddq xmm9, xmm3, [rcx+48] + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vpaddq xmm8, xmm4, [rcx+64] + vpaddq xmm9, xmm5, [rcx+80] + vmovdqu OWORD PTR [rsp+64], xmm8 + vmovdqu OWORD PTR [rsp+80], xmm9 + vpaddq xmm8, xmm6, [rcx+96] + vpaddq xmm9, xmm7, [rcx+112] + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + ; Start of 16 rounds +L_sha512_len_avx1_rorx_start: + add rcx, 128 + mov QWORD PTR [rsp+136], rcx + ; msg_sched: 0-1 + ; rnd_0: 0 - 0 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + vpalignr xmm12, xmm1, xmm0, 8 + vpalignr xmm13, xmm5, xmm4, 8 + ; rnd_0: 1 - 1 + add r15, QWORD PTR [rsp] + mov rdx, r13 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r14 + rorx 
rax, r12, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm0, xmm13, xmm0 + ; rnd_0: 6 - 7 + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + vpaddq xmm0, xmm8, xmm0 + ; rnd_1: 0 - 0 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + vpsrlq xmm8, xmm7, 19 + vpsllq xmm9, xmm7, 45 + ; rnd_1: 1 - 1 + add r14, QWORD PTR [rsp+8] + mov rbx, r12 + xor rcx, rax + vpsrlq xmm10, xmm7, 61 + vpsllq xmm11, xmm7, 3 + ; rnd_1: 2 - 2 + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm7, 6 + ; rnd_1: 5 - 6 + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + add r10, r14 + xor rbx, r15 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r14, rax + xor rdx, r8 + vpaddq xmm0, xmm8, xmm0 + ; msg_sched done: 0-1 + ; msg_sched: 2-3 + ; rnd_0: 0 - 0 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + vpalignr xmm12, xmm2, xmm1, 8 + vpalignr xmm13, xmm6, xmm5, 8 + ; rnd_0: 1 - 1 + add r13, QWORD PTR [rsp+16] + mov rdx, r11 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx 
rax, r14, 39 + add r13, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm1, xmm13, xmm1 + ; rnd_0: 6 - 7 + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + vpaddq xmm1, xmm8, xmm1 + ; rnd_1: 0 - 0 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + vpsrlq xmm8, xmm0, 19 + vpsllq xmm9, xmm0, 45 + ; rnd_1: 1 - 1 + add r12, QWORD PTR [rsp+24] + mov rbx, r10 + xor rcx, rax + vpsrlq xmm10, xmm0, 61 + vpsllq xmm11, xmm0, 3 + ; rnd_1: 2 - 2 + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm0, 6 + ; rnd_1: 5 - 6 + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + add r8, r12 + xor rbx, r13 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r12, rax + xor rdx, r14 + vpaddq xmm1, xmm8, xmm1 + ; msg_sched done: 2-3 + ; msg_sched: 4-5 + ; rnd_0: 0 - 0 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + vpalignr xmm12, xmm3, xmm2, 8 + vpalignr xmm13, xmm7, xmm6, 8 + ; rnd_0: 1 - 1 + add r11, QWORD PTR [rsp+32] + mov rdx, r9 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm2, xmm13, xmm2 + ; rnd_0: 6 - 7 + mov rdx, r13 + add r15, r11 + xor rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + vpaddq xmm2, xmm8, xmm2 + ; rnd_1: 0 - 0 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + vpsrlq xmm8, xmm1, 19 + vpsllq xmm9, 
xmm1, 45 + ; rnd_1: 1 - 1 + add r10, QWORD PTR [rsp+40] + mov rbx, r8 + xor rcx, rax + vpsrlq xmm10, xmm1, 61 + vpsllq xmm11, xmm1, 3 + ; rnd_1: 2 - 2 + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm1, 6 + ; rnd_1: 5 - 6 + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + add r14, r10 + xor rbx, r11 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r10, rax + xor rdx, r12 + vpaddq xmm2, xmm8, xmm2 + ; msg_sched done: 4-5 + ; msg_sched: 6-7 + ; rnd_0: 0 - 0 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + vpalignr xmm12, xmm4, xmm3, 8 + vpalignr xmm13, xmm0, xmm7, 8 + ; rnd_0: 1 - 1 + add r9, QWORD PTR [rsp+48] + mov rdx, r15 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm3, xmm13, xmm3 + ; rnd_0: 6 - 7 + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, r11 + vpaddq xmm3, xmm8, xmm3 + ; rnd_1: 0 - 0 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + vpsrlq xmm8, xmm2, 19 + vpsllq xmm9, xmm2, 45 + ; rnd_1: 1 - 1 + add r8, QWORD PTR [rsp+56] + mov rbx, r14 + xor rcx, rax + vpsrlq xmm10, xmm2, 61 + vpsllq xmm11, xmm2, 3 + ; rnd_1: 2 - 2 + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor 
rbx, r15 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm2, 6 + ; rnd_1: 5 - 6 + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + add r12, r8 + xor rbx, r9 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r8, rax + xor rdx, r10 + vpaddq xmm3, xmm8, xmm3 + ; msg_sched done: 6-7 + ; msg_sched: 8-9 + ; rnd_0: 0 - 0 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + vpalignr xmm12, xmm5, xmm4, 8 + vpalignr xmm13, xmm1, xmm0, 8 + ; rnd_0: 1 - 1 + add r15, QWORD PTR [rsp+64] + mov rdx, r13 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm4, xmm13, xmm4 + ; rnd_0: 6 - 7 + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + vpaddq xmm4, xmm8, xmm4 + ; rnd_1: 0 - 0 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + vpsrlq xmm8, xmm3, 19 + vpsllq xmm9, xmm3, 45 + ; rnd_1: 1 - 1 + add r14, QWORD PTR [rsp+72] + mov rbx, r12 + xor rcx, rax + vpsrlq xmm10, xmm3, 61 + vpsllq xmm11, xmm3, 3 + ; rnd_1: 2 - 2 + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm3, 6 + ; rnd_1: 5 - 6 + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + add r10, r14 + xor rbx, r15 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r14, rax + xor rdx, r8 + vpaddq xmm4, xmm8, xmm4 + ; msg_sched done: 8-9 + ; msg_sched: 10-11 
+ ; rnd_0: 0 - 0 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + vpalignr xmm12, xmm6, xmm5, 8 + vpalignr xmm13, xmm2, xmm1, 8 + ; rnd_0: 1 - 1 + add r13, QWORD PTR [rsp+80] + mov rdx, r11 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm5, xmm13, xmm5 + ; rnd_0: 6 - 7 + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + vpaddq xmm5, xmm8, xmm5 + ; rnd_1: 0 - 0 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + vpsrlq xmm8, xmm4, 19 + vpsllq xmm9, xmm4, 45 + ; rnd_1: 1 - 1 + add r12, QWORD PTR [rsp+88] + mov rbx, r10 + xor rcx, rax + vpsrlq xmm10, xmm4, 61 + vpsllq xmm11, xmm4, 3 + ; rnd_1: 2 - 2 + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm4, 6 + ; rnd_1: 5 - 6 + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + add r8, r12 + xor rbx, r13 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r12, rax + xor rdx, r14 + vpaddq xmm5, xmm8, xmm5 + ; msg_sched done: 10-11 + ; msg_sched: 12-13 + ; rnd_0: 0 - 0 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + vpalignr xmm12, xmm7, xmm6, 8 + vpalignr xmm13, xmm3, xmm2, 8 + ; rnd_0: 1 - 1 + add r11, QWORD PTR [rsp+96] + mov rdx, r9 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + 
vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm6, xmm13, xmm6 + ; rnd_0: 6 - 7 + mov rdx, r13 + add r15, r11 + xor rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + vpaddq xmm6, xmm8, xmm6 + ; rnd_1: 0 - 0 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + vpsrlq xmm8, xmm5, 19 + vpsllq xmm9, xmm5, 45 + ; rnd_1: 1 - 1 + add r10, QWORD PTR [rsp+104] + mov rbx, r8 + xor rcx, rax + vpsrlq xmm10, xmm5, 61 + vpsllq xmm11, xmm5, 3 + ; rnd_1: 2 - 2 + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm5, 6 + ; rnd_1: 5 - 6 + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + add r14, r10 + xor rbx, r11 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r10, rax + xor rdx, r12 + vpaddq xmm6, xmm8, xmm6 + ; msg_sched done: 12-13 + ; msg_sched: 14-15 + ; rnd_0: 0 - 0 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + vpalignr xmm12, xmm0, xmm7, 8 + vpalignr xmm13, xmm4, xmm3, 8 + ; rnd_0: 1 - 1 + add r9, QWORD PTR [rsp+112] + mov rdx, r15 + xor rcx, rax + vpsrlq xmm8, xmm12, 1 + vpsllq xmm9, xmm12, 63 + ; rnd_0: 2 - 2 + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + vpsrlq xmm10, xmm12, 8 + vpsllq xmm11, xmm12, 56 + ; rnd_0: 3 - 3 + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_0: 4 - 4 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + vpsrlq xmm11, xmm12, 7 + vpxor xmm8, xmm8, xmm10 + ; rnd_0: 5 - 5 + rorx rax, r10, 39 + add r9, 
rdx + xor rax, rcx + vpxor xmm8, xmm8, xmm11 + vpaddq xmm7, xmm13, xmm7 + ; rnd_0: 6 - 7 + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, r11 + vpaddq xmm7, xmm8, xmm7 + ; rnd_1: 0 - 0 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + vpsrlq xmm8, xmm6, 19 + vpsllq xmm9, xmm6, 45 + ; rnd_1: 1 - 1 + add r8, QWORD PTR [rsp+120] + mov rbx, r14 + xor rcx, rax + vpsrlq xmm10, xmm6, 61 + vpsllq xmm11, xmm6, 3 + ; rnd_1: 2 - 2 + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + vpor xmm8, xmm8, xmm9 + vpor xmm10, xmm10, xmm11 + ; rnd_1: 3 - 4 + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + vpxor xmm8, xmm8, xmm10 + vpsrlq xmm11, xmm6, 6 + ; rnd_1: 5 - 6 + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + add r12, r8 + xor rbx, r9 + vpxor xmm8, xmm8, xmm11 + ; rnd_1: 7 - 7 + and rdx, rbx + add r8, rax + xor rdx, r10 + vpaddq xmm7, xmm8, xmm7 + ; msg_sched done: 14-15 + mov rcx, QWORD PTR [rsp+136] + vpaddq xmm8, xmm0, [rcx] + vpaddq xmm9, xmm1, [rcx+16] + vmovdqu OWORD PTR [rsp], xmm8 + vmovdqu OWORD PTR [rsp+16], xmm9 + vpaddq xmm8, xmm2, [rcx+32] + vpaddq xmm9, xmm3, [rcx+48] + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vpaddq xmm8, xmm4, [rcx+64] + vpaddq xmm9, xmm5, [rcx+80] + vmovdqu OWORD PTR [rsp+64], xmm8 + vmovdqu OWORD PTR [rsp+80], xmm9 + vpaddq xmm8, xmm6, [rcx+96] + vpaddq xmm9, xmm7, [rcx+112] + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + sub DWORD PTR [rsp+128], 1 + jne L_sha512_len_avx1_rorx_start + vpaddq xmm8, xmm0, [rcx] + vpaddq xmm9, xmm1, [rcx+16] + vmovdqu OWORD PTR [rsp], xmm8 + vmovdqu OWORD PTR [rsp+16], xmm9 + vpaddq xmm8, xmm2, [rcx+32] + vpaddq xmm9, xmm3, [rcx+48] + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vpaddq xmm8, xmm4, [rcx+64] + vpaddq xmm9, xmm5, [rcx+80] + vmovdqu OWORD PTR [rsp+64], xmm8 + vmovdqu OWORD PTR [rsp+80], xmm9 + vpaddq xmm8, xmm6, 
[rcx+96] + vpaddq xmm9, xmm7, [rcx+112] + vmovdqu OWORD PTR [rsp+96], xmm8 + vmovdqu OWORD PTR [rsp+112], xmm9 + ; rnd_all_2: 0-1 + ; rnd_0: 0 - 7 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + add r15, QWORD PTR [rsp] + mov rdx, r13 + xor rcx, rax + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + ; rnd_1: 0 - 7 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + add r14, QWORD PTR [rsp+8] + mov rbx, r12 + xor rcx, rax + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + add r10, r14 + xor rbx, r15 + and rdx, rbx + add r14, rax + xor rdx, r8 + ; rnd_all_2: 2-3 + ; rnd_0: 0 - 7 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + add r13, QWORD PTR [rsp+16] + mov rdx, r11 + xor rcx, rax + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + ; rnd_1: 0 - 7 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + add r12, QWORD PTR [rsp+24] + mov rbx, r10 + xor rcx, rax + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + add r8, r12 + xor rbx, r13 + and rdx, rbx + add r12, rax + xor rdx, r14 + ; rnd_all_2: 4-5 + ; rnd_0: 0 - 7 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + add r11, QWORD PTR [rsp+32] + mov rdx, r9 + xor rcx, rax + xor rdx, r10 + rorx rax, r8, 41 
+ xor rax, rcx + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + mov rdx, r13 + add r15, r11 + xor rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + ; rnd_1: 0 - 7 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + add r10, QWORD PTR [rsp+40] + mov rbx, r8 + xor rcx, rax + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + add r14, r10 + xor rbx, r11 + and rdx, rbx + add r10, rax + xor rdx, r12 + ; rnd_all_2: 6-7 + ; rnd_0: 0 - 7 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + add r9, QWORD PTR [rsp+48] + mov rdx, r15 + xor rcx, rax + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, r11 + ; rnd_1: 0 - 7 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + add r8, QWORD PTR [rsp+56] + mov rbx, r14 + xor rcx, rax + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + add r12, r8 + xor rbx, r9 + and rdx, rbx + add r8, rax + xor rdx, r10 + ; rnd_all_2: 8-9 + ; rnd_0: 0 - 7 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + add r15, QWORD PTR [rsp+64] + mov rdx, r13 + xor rcx, rax + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + ; rnd_1: 0 - 7 + rorx rax, r11, 14 + rorx 
rcx, r11, 18 + add r15, rbx + add r14, QWORD PTR [rsp+72] + mov rbx, r12 + xor rcx, rax + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + add r10, r14 + xor rbx, r15 + and rdx, rbx + add r14, rax + xor rdx, r8 + ; rnd_all_2: 10-11 + ; rnd_0: 0 - 7 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + add r13, QWORD PTR [rsp+80] + mov rdx, r11 + xor rcx, rax + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + ; rnd_1: 0 - 7 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + add r12, QWORD PTR [rsp+88] + mov rbx, r10 + xor rcx, rax + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + add r8, r12 + xor rbx, r13 + and rdx, rbx + add r12, rax + xor rdx, r14 + ; rnd_all_2: 12-13 + ; rnd_0: 0 - 7 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + add r11, QWORD PTR [rsp+96] + mov rdx, r9 + xor rcx, rax + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + mov rdx, r13 + add r15, r11 + xor rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + ; rnd_1: 0 - 7 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + add r10, QWORD PTR [rsp+104] + mov rbx, r8 + xor rcx, rax + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + rorx rax, r11, 39 + add r10, rbx + xor rax, 
rcx + mov rbx, r12 + add r14, r10 + xor rbx, r11 + and rdx, rbx + add r10, rax + xor rdx, r12 + ; rnd_all_2: 14-15 + ; rnd_0: 0 - 7 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + add r9, QWORD PTR [rsp+112] + mov rdx, r15 + xor rcx, rax + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, r11 + ; rnd_1: 0 - 7 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + add r8, QWORD PTR [rsp+120] + mov rbx, r14 + xor rcx, rax + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + add r12, r8 + xor rbx, r9 + and rdx, rbx + add r8, rax + xor rdx, r10 + add r8, rdx + add r8, QWORD PTR [rdi] + add r9, QWORD PTR [rdi+8] + add r10, QWORD PTR [rdi+16] + add r11, QWORD PTR [rdi+24] + add r12, QWORD PTR [rdi+32] + add r13, QWORD PTR [rdi+40] + add r14, QWORD PTR [rdi+48] + add r15, QWORD PTR [rdi+56] + mov rcx, QWORD PTR [ptr_L_avx1_rorx_sha512_k] + add rsi, 128 + sub ebp, 128 + mov QWORD PTR [rdi], r8 + mov QWORD PTR [rdi+8], r9 + mov QWORD PTR [rdi+16], r10 + mov QWORD PTR [rdi+24], r11 + mov QWORD PTR [rdi+32], r12 + mov QWORD PTR [rdi+40], r13 + mov QWORD PTR [rdi+48], r14 + mov QWORD PTR [rdi+56], r15 + jnz L_sha512_len_avx1_rorx_begin + xor rax, rax + vmovdqu xmm6, OWORD PTR [rsp+144] + vmovdqu xmm7, OWORD PTR [rsp+160] + vmovdqu xmm8, OWORD PTR [rsp+176] + vmovdqu xmm9, OWORD PTR [rsp+192] + vmovdqu xmm10, OWORD PTR [rsp+208] + vmovdqu xmm11, OWORD PTR [rsp+224] + vmovdqu xmm13, OWORD PTR [rsp+240] + vmovdqu xmm12, OWORD PTR [rsp+256] + vmovdqu xmm14, OWORD PTR [rsp+272] + add rsp, 288 + pop rbp + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret 
+Transform_Sha512_AVX1_RORX_Len ENDP
+_TEXT ENDS
+ENDIF ; closes a conditional-assembly block opened earlier in the file (opening directive not visible in this chunk)
+IFDEF HAVE_INTEL_AVX2 ; everything that follows is assembled only when HAVE_INTEL_AVX2 is defined
+_DATA SEGMENT ; data segment holding the SHA-512 round-constant table used by Transform_Sha512_AVX2
+ALIGN 16 ; place the table on a 16-byte boundary
+L_avx2_sha512_k QWORD 428a2f98d728ae22h, 7137449123ef65cdh ; SHA-512 round constants K[0..79] (FIPS 180-4, sec. 4.2.3), two per row; this row: K[0], K[1]
+ QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch ; K[2], K[3]  (leading 0 is the MASM requirement that a hex literal start with a digit)
+ QWORD 3956c25bf348b538h, 59f111f1b605d019h ; K[4], K[5]
+ QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h ; K[6], K[7]
+ QWORD 0d807aa98a3030242h, 12835b0145706fbeh ; K[8], K[9]
+ QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h ; K[10], K[11]
+ QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h ; K[12], K[13]
+ QWORD 9bdc06a725c71235h, 0c19bf174cf692694h ; K[14], K[15]
+ QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h ; K[16], K[17]
+ QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h ; K[18], K[19]
+ QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h ; K[20], K[21]
+ QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h ; K[22], K[23]
+ QWORD 983e5152ee66dfabh, 0a831c66d2db43210h ; K[24], K[25]
+ QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h ; K[26], K[27]
+ QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h ; K[28], K[29]
+ QWORD 06ca6351e003826fh, 142929670a0e6e70h ; K[30], K[31]
+ QWORD 27b70a8546d22ffch, 2e1b21385c26c926h ; K[32], K[33]
+ QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh ; K[34], K[35]
+ QWORD 650a73548baf63deh, 766a0abb3c77b2a8h ; K[36], K[37]
+ QWORD 81c2c92e47edaee6h, 92722c851482353bh ; K[38], K[39]
+ QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h ; K[40], K[41]
+ QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h ; K[42], K[43]
+ QWORD 0d192e819d6ef5218h, 0d69906245565a910h ; K[44], K[45]
+ QWORD 0f40e35855771202ah, 106aa07032bbd1b8h ; K[46], K[47]
+ QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h ; K[48], K[49]
+ QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h ; K[50], K[51]
+ QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh ; K[52], K[53]
+ QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h ; K[54], K[55]
+ QWORD 748f82ee5defb2fch, 78a5636f43172f60h ; K[56], K[57]
+ QWORD 84c87814a1f0ab72h, 8cc702081a6439ech ; K[58], K[59]
+ QWORD 90befffa23631e28h, 0a4506cebde82bde9h ; K[60], K[61]
+ QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh ; K[62], K[63]
+ QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h ; K[64], K[65]
+ QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h ; K[66], K[67]
+ QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h ; K[68], K[69]
+ QWORD 113f9804bef90daeh, 1b710b35131c471bh ; K[70], K[71]
+ QWORD 28db77f523047d84h, 32caab7b40c72493h ; K[72], K[73]
+ QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch ; K[74], K[75]
+ QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah ; K[76], K[77]
+ QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h ; K[78], K[79]
+ptr_L_avx2_sha512_k QWORD L_avx2_sha512_k ; address of the table; Transform_Sha512_AVX2 loads this into rsi and vpaddq's table entries onto the message words
+_DATA ENDS
+_DATA SEGMENT +ALIGN 16 +L_avx2_sha512_k_2 QWORD 428a2f98d728ae22h, 7137449123ef65cdh + QWORD 428a2f98d728ae22h, 7137449123ef65cdh + QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch + QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch + QWORD 3956c25bf348b538h, 59f111f1b605d019h + QWORD 3956c25bf348b538h, 59f111f1b605d019h + QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h + QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h + QWORD 0d807aa98a3030242h, 12835b0145706fbeh + QWORD 0d807aa98a3030242h, 12835b0145706fbeh + QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h + QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h + QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h + QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h + QWORD 9bdc06a725c71235h, 0c19bf174cf692694h + QWORD 9bdc06a725c71235h, 0c19bf174cf692694h + QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h + QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h + QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h + QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h + QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h + QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h + QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h + QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h + QWORD 983e5152ee66dfabh, 0a831c66d2db43210h + QWORD 983e5152ee66dfabh, 0a831c66d2db43210h + QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h + QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h + QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h + QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h + QWORD 06ca6351e003826fh, 142929670a0e6e70h + QWORD 06ca6351e003826fh, 142929670a0e6e70h + QWORD 27b70a8546d22ffch, 2e1b21385c26c926h + QWORD 27b70a8546d22ffch, 2e1b21385c26c926h + QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh + QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh + QWORD 650a73548baf63deh, 766a0abb3c77b2a8h + QWORD 650a73548baf63deh, 766a0abb3c77b2a8h + QWORD 81c2c92e47edaee6h, 92722c851482353bh + QWORD 81c2c92e47edaee6h, 92722c851482353bh + QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h + QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h + QWORD 0c24b8b70d0f89791h, 
0c76c51a30654be30h + QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h + QWORD 0d192e819d6ef5218h, 0d69906245565a910h + QWORD 0d192e819d6ef5218h, 0d69906245565a910h + QWORD 0f40e35855771202ah, 106aa07032bbd1b8h + QWORD 0f40e35855771202ah, 106aa07032bbd1b8h + QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h + QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h + QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h + QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h + QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh + QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh + QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h + QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h + QWORD 748f82ee5defb2fch, 78a5636f43172f60h + QWORD 748f82ee5defb2fch, 78a5636f43172f60h + QWORD 84c87814a1f0ab72h, 8cc702081a6439ech + QWORD 84c87814a1f0ab72h, 8cc702081a6439ech + QWORD 90befffa23631e28h, 0a4506cebde82bde9h + QWORD 90befffa23631e28h, 0a4506cebde82bde9h + QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh + QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh + QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h + QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h + QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h + QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h + QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h + QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h + QWORD 113f9804bef90daeh, 1b710b35131c471bh + QWORD 113f9804bef90daeh, 1b710b35131c471bh + QWORD 28db77f523047d84h, 32caab7b40c72493h + QWORD 28db77f523047d84h, 32caab7b40c72493h + QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch + QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch + QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah + QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah + QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h + QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h +ptr_L_avx2_sha512_k_2 QWORD L_avx2_sha512_k_2 +_DATA ENDS +_DATA SEGMENT +ALIGN 8 +L_avx2_sha512_k_2_end QWORD 1024+L_avx2_sha512_k_2 +ptr_L_avx2_sha512_k_2_end QWORD L_avx2_sha512_k_2_end +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_sha512_flip_mask QWORD 0001020304050607h, 
08090a0b0c0d0e0fh + QWORD 0001020304050607h, 08090a0b0c0d0e0fh +ptr_L_avx2_sha512_flip_mask QWORD L_avx2_sha512_flip_mask +_DATA ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha512_AVX2 PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + mov rdi, rcx + sub rsp, 296 + vmovdqu OWORD PTR [rsp+136], xmm6 + vmovdqu OWORD PTR [rsp+152], xmm7 + vmovdqu OWORD PTR [rsp+168], xmm8 + vmovdqu OWORD PTR [rsp+184], xmm9 + vmovdqu OWORD PTR [rsp+200], xmm10 + vmovdqu OWORD PTR [rsp+216], xmm11 + vmovdqu OWORD PTR [rsp+232], xmm14 + vmovdqu OWORD PTR [rsp+248], xmm13 + vmovdqu OWORD PTR [rsp+264], xmm12 + vmovdqu OWORD PTR [rsp+280], xmm15 + lea rax, QWORD PTR [rdi+64] + vmovdqu ymm15, YMMWORD PTR L_avx2_sha512_flip_mask + mov r8, QWORD PTR [rdi] + mov r9, QWORD PTR [rdi+8] + mov r10, QWORD PTR [rdi+16] + mov r11, QWORD PTR [rdi+24] + mov r12, QWORD PTR [rdi+32] + mov r13, QWORD PTR [rdi+40] + mov r14, QWORD PTR [rdi+48] + mov r15, QWORD PTR [rdi+56] + vmovdqu ymm0, YMMWORD PTR [rax] + vmovdqu ymm1, YMMWORD PTR [rax+32] + vpshufb ymm0, ymm0, ymm15 + vpshufb ymm1, ymm1, ymm15 + vmovdqu ymm2, YMMWORD PTR [rax+64] + vmovdqu ymm3, YMMWORD PTR [rax+96] + vpshufb ymm2, ymm2, ymm15 + vpshufb ymm3, ymm3, ymm15 + mov DWORD PTR [rsp+128], 4 + mov rsi, QWORD PTR [ptr_L_avx2_sha512_k] + mov rbx, r9 + mov rax, r12 + xor rbx, r10 + vpaddq ymm8, ymm0, [rsi] + vpaddq ymm9, ymm1, [rsi+32] + vmovdqu YMMWORD PTR [rsp], ymm8 + vmovdqu YMMWORD PTR [rsp+32], ymm9 + vpaddq ymm8, ymm2, [rsi+64] + vpaddq ymm9, ymm3, [rsi+96] + vmovdqu YMMWORD PTR [rsp+64], ymm8 + vmovdqu YMMWORD PTR [rsp+96], ymm9 + ; Start of 16 rounds +L_sha256_avx2_start: + add rsi, 128 + ror rax, 23 + vpblendd ymm12, ymm0, ymm1, 3 + vpblendd ymm13, ymm2, ymm3, 3 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rsp] + xor rcx, r14 + xor rax, r12 + and rcx, r12 + vpermq ymm12, ymm12, 57 + ror rax, 4 + xor rcx, r14 + vpermq ymm13, ymm13, 57 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + 
vpsrlq ymm8, ymm12, 1 + add r15, rax + mov rcx, r8 + vpsllq ymm9, ymm12, 63 + and rbx, rdx + ror rcx, 5 + vpsrlq ymm10, ymm12, 8 + xor rcx, r8 + xor rbx, r9 + vpsllq ymm11, ymm12, 56 + ror rcx, 6 + add r11, r15 + vpor ymm8, ymm8, ymm9 + xor rcx, r8 + add r15, rbx + vpor ymm10, ymm10, ymm11 + ror rcx, 28 + mov rax, r11 + add r15, rcx + ror rax, 23 + vpsrlq ymm11, ymm12, 7 + mov rbx, r15 + mov rcx, r12 + vpxor ymm8, ymm8, ymm10 + add r14, QWORD PTR [rsp+8] + xor rcx, r13 + vpxor ymm8, ymm8, ymm11 + xor rax, r11 + and rcx, r11 + vpaddq ymm0, ymm13, ymm0 + ror rax, 4 + xor rcx, r13 + vpaddq ymm0, ymm8, ymm0 + xor rax, r11 + add r14, rcx + vperm2I128 ymm14, ymm3, ymm3, 129 + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + and rdx, rbx + ror rcx, 5 + vpsrlq ymm8, ymm14, 19 + xor rcx, r15 + xor rdx, r8 + vpsllq ymm9, ymm14, 45 + ror rcx, 6 + add r10, r14 + vpsrlq ymm10, ymm14, 61 + xor rcx, r15 + add r14, rdx + vpsllq ymm11, ymm14, 3 + ror rcx, 28 + mov rax, r10 + add r14, rcx + ror rax, 23 + vpor ymm8, ymm8, ymm9 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rsp+16] + xor rcx, r12 + vpor ymm10, ymm10, ymm11 + xor rax, r10 + and rcx, r10 + vpxor ymm8, ymm8, ymm10 + ror rax, 4 + xor rcx, r12 + vpsrlq ymm11, ymm14, 6 + xor rax, r10 + add r13, rcx + vpxor ymm8, ymm8, ymm11 + ror rax, 14 + xor rdx, r15 + vpaddq ymm0, ymm8, ymm0 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + vperm2I128 ymm14, ymm0, ymm0, 8 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + vpsrlq ymm8, ymm14, 19 + xor rcx, r14 + add r13, rbx + vpsllq ymm9, ymm14, 45 + ror rcx, 28 + mov rax, r9 + add r13, rcx + ror rax, 23 + vpsrlq ymm10, ymm14, 61 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rsp+24] + xor rcx, r11 + vpsllq ymm11, ymm14, 3 + xor rax, r9 + and rcx, r9 + vpor ymm8, ymm8, ymm9 + ror rax, 4 + xor rcx, r11 + vpor ymm10, ymm10, ymm11 + xor rax, r9 + add r12, rcx + vpxor ymm8, ymm8, ymm10 + ror rax, 14 + xor rbx, r14 + vpsrlq ymm11, ymm14, 6 + add r12, rax + 
mov rcx, r13 + vpxor ymm8, ymm8, ymm11 + and rdx, rbx + ror rcx, 5 + vpaddq ymm0, ymm8, ymm0 + xor rcx, r13 + xor rdx, r14 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + ror rcx, 28 + mov rax, r8 + add r12, rcx + ror rax, 23 + vpblendd ymm12, ymm1, ymm2, 3 + vpblendd ymm13, ymm3, ymm0, 3 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rsp+32] + xor rcx, r10 + xor rax, r8 + and rcx, r8 + vpermq ymm12, ymm12, 57 + ror rax, 4 + xor rcx, r10 + vpermq ymm13, ymm13, 57 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + vpsrlq ymm8, ymm12, 1 + add r11, rax + mov rcx, r12 + vpsllq ymm9, ymm12, 63 + and rbx, rdx + ror rcx, 5 + vpsrlq ymm10, ymm12, 8 + xor rcx, r12 + xor rbx, r13 + vpsllq ymm11, ymm12, 56 + ror rcx, 6 + add r15, r11 + vpor ymm8, ymm8, ymm9 + xor rcx, r12 + add r11, rbx + vpor ymm10, ymm10, ymm11 + ror rcx, 28 + mov rax, r15 + add r11, rcx + ror rax, 23 + vpsrlq ymm11, ymm12, 7 + mov rbx, r11 + mov rcx, r8 + vpxor ymm8, ymm8, ymm10 + add r10, QWORD PTR [rsp+40] + xor rcx, r9 + vpxor ymm8, ymm8, ymm11 + xor rax, r15 + and rcx, r15 + vpaddq ymm1, ymm13, ymm1 + ror rax, 4 + xor rcx, r9 + vpaddq ymm1, ymm8, ymm1 + xor rax, r15 + add r10, rcx + vperm2I128 ymm14, ymm0, ymm0, 129 + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + and rdx, rbx + ror rcx, 5 + vpsrlq ymm8, ymm14, 19 + xor rcx, r11 + xor rdx, r12 + vpsllq ymm9, ymm14, 45 + ror rcx, 6 + add r14, r10 + vpsrlq ymm10, ymm14, 61 + xor rcx, r11 + add r10, rdx + vpsllq ymm11, ymm14, 3 + ror rcx, 28 + mov rax, r14 + add r10, rcx + ror rax, 23 + vpor ymm8, ymm8, ymm9 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rsp+48] + xor rcx, r8 + vpor ymm10, ymm10, ymm11 + xor rax, r14 + and rcx, r14 + vpxor ymm8, ymm8, ymm10 + ror rax, 4 + xor rcx, r8 + vpsrlq ymm11, ymm14, 6 + xor rax, r14 + add r9, rcx + vpxor ymm8, ymm8, ymm11 + ror rax, 14 + xor rdx, r11 + vpaddq ymm1, ymm8, ymm1 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + vperm2I128 ymm14, ymm1, ymm1, 8 + xor rcx, 
r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + vpsrlq ymm8, ymm14, 19 + xor rcx, r10 + add r9, rbx + vpsllq ymm9, ymm14, 45 + ror rcx, 28 + mov rax, r13 + add r9, rcx + ror rax, 23 + vpsrlq ymm10, ymm14, 61 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rsp+56] + xor rcx, r15 + vpsllq ymm11, ymm14, 3 + xor rax, r13 + and rcx, r13 + vpor ymm8, ymm8, ymm9 + ror rax, 4 + xor rcx, r15 + vpor ymm10, ymm10, ymm11 + xor rax, r13 + add r8, rcx + vpxor ymm8, ymm8, ymm10 + ror rax, 14 + xor rbx, r10 + vpsrlq ymm11, ymm14, 6 + add r8, rax + mov rcx, r9 + vpxor ymm8, ymm8, ymm11 + and rdx, rbx + ror rcx, 5 + vpaddq ymm1, ymm8, ymm1 + xor rcx, r9 + xor rdx, r10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + ror rcx, 28 + mov rax, r12 + add r8, rcx + ror rax, 23 + vpblendd ymm12, ymm2, ymm3, 3 + vpblendd ymm13, ymm0, ymm1, 3 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rsp+64] + xor rcx, r14 + xor rax, r12 + and rcx, r12 + vpermq ymm12, ymm12, 57 + ror rax, 4 + xor rcx, r14 + vpermq ymm13, ymm13, 57 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + vpsrlq ymm8, ymm12, 1 + add r15, rax + mov rcx, r8 + vpsllq ymm9, ymm12, 63 + and rbx, rdx + ror rcx, 5 + vpsrlq ymm10, ymm12, 8 + xor rcx, r8 + xor rbx, r9 + vpsllq ymm11, ymm12, 56 + ror rcx, 6 + add r11, r15 + vpor ymm8, ymm8, ymm9 + xor rcx, r8 + add r15, rbx + vpor ymm10, ymm10, ymm11 + ror rcx, 28 + mov rax, r11 + add r15, rcx + ror rax, 23 + vpsrlq ymm11, ymm12, 7 + mov rbx, r15 + mov rcx, r12 + vpxor ymm8, ymm8, ymm10 + add r14, QWORD PTR [rsp+72] + xor rcx, r13 + vpxor ymm8, ymm8, ymm11 + xor rax, r11 + and rcx, r11 + vpaddq ymm2, ymm13, ymm2 + ror rax, 4 + xor rcx, r13 + vpaddq ymm2, ymm8, ymm2 + xor rax, r11 + add r14, rcx + vperm2I128 ymm14, ymm1, ymm1, 129 + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + and rdx, rbx + ror rcx, 5 + vpsrlq ymm8, ymm14, 19 + xor rcx, r15 + xor rdx, r8 + vpsllq ymm9, ymm14, 45 + ror rcx, 6 + add r10, r14 + vpsrlq ymm10, ymm14, 61 + xor rcx, r15 + add 
r14, rdx + vpsllq ymm11, ymm14, 3 + ror rcx, 28 + mov rax, r10 + add r14, rcx + ror rax, 23 + vpor ymm8, ymm8, ymm9 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rsp+80] + xor rcx, r12 + vpor ymm10, ymm10, ymm11 + xor rax, r10 + and rcx, r10 + vpxor ymm8, ymm8, ymm10 + ror rax, 4 + xor rcx, r12 + vpsrlq ymm11, ymm14, 6 + xor rax, r10 + add r13, rcx + vpxor ymm8, ymm8, ymm11 + ror rax, 14 + xor rdx, r15 + vpaddq ymm2, ymm8, ymm2 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + vperm2I128 ymm14, ymm2, ymm2, 8 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + vpsrlq ymm8, ymm14, 19 + xor rcx, r14 + add r13, rbx + vpsllq ymm9, ymm14, 45 + ror rcx, 28 + mov rax, r9 + add r13, rcx + ror rax, 23 + vpsrlq ymm10, ymm14, 61 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rsp+88] + xor rcx, r11 + vpsllq ymm11, ymm14, 3 + xor rax, r9 + and rcx, r9 + vpor ymm8, ymm8, ymm9 + ror rax, 4 + xor rcx, r11 + vpor ymm10, ymm10, ymm11 + xor rax, r9 + add r12, rcx + vpxor ymm8, ymm8, ymm10 + ror rax, 14 + xor rbx, r14 + vpsrlq ymm11, ymm14, 6 + add r12, rax + mov rcx, r13 + vpxor ymm8, ymm8, ymm11 + and rdx, rbx + ror rcx, 5 + vpaddq ymm2, ymm8, ymm2 + xor rcx, r13 + xor rdx, r14 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + ror rcx, 28 + mov rax, r8 + add r12, rcx + ror rax, 23 + vpblendd ymm12, ymm3, ymm0, 3 + vpblendd ymm13, ymm1, ymm2, 3 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rsp+96] + xor rcx, r10 + xor rax, r8 + and rcx, r8 + vpermq ymm12, ymm12, 57 + ror rax, 4 + xor rcx, r10 + vpermq ymm13, ymm13, 57 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + vpsrlq ymm8, ymm12, 1 + add r11, rax + mov rcx, r12 + vpsllq ymm9, ymm12, 63 + and rbx, rdx + ror rcx, 5 + vpsrlq ymm10, ymm12, 8 + xor rcx, r12 + xor rbx, r13 + vpsllq ymm11, ymm12, 56 + ror rcx, 6 + add r15, r11 + vpor ymm8, ymm8, ymm9 + xor rcx, r12 + add r11, rbx + vpor ymm10, ymm10, ymm11 + ror rcx, 28 + mov rax, r15 + add r11, rcx + ror rax, 23 + vpsrlq ymm11, 
ymm12, 7 + mov rbx, r11 + mov rcx, r8 + vpxor ymm8, ymm8, ymm10 + add r10, QWORD PTR [rsp+104] + xor rcx, r9 + vpxor ymm8, ymm8, ymm11 + xor rax, r15 + and rcx, r15 + vpaddq ymm3, ymm13, ymm3 + ror rax, 4 + xor rcx, r9 + vpaddq ymm3, ymm8, ymm3 + xor rax, r15 + add r10, rcx + vperm2I128 ymm14, ymm2, ymm2, 129 + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + and rdx, rbx + ror rcx, 5 + vpsrlq ymm8, ymm14, 19 + xor rcx, r11 + xor rdx, r12 + vpsllq ymm9, ymm14, 45 + ror rcx, 6 + add r14, r10 + vpsrlq ymm10, ymm14, 61 + xor rcx, r11 + add r10, rdx + vpsllq ymm11, ymm14, 3 + ror rcx, 28 + mov rax, r14 + add r10, rcx + ror rax, 23 + vpor ymm8, ymm8, ymm9 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rsp+112] + xor rcx, r8 + vpor ymm10, ymm10, ymm11 + xor rax, r14 + and rcx, r14 + vpxor ymm8, ymm8, ymm10 + ror rax, 4 + xor rcx, r8 + vpsrlq ymm11, ymm14, 6 + xor rax, r14 + add r9, rcx + vpxor ymm8, ymm8, ymm11 + ror rax, 14 + xor rdx, r11 + vpaddq ymm3, ymm8, ymm3 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + vperm2I128 ymm14, ymm3, ymm3, 8 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + vpsrlq ymm8, ymm14, 19 + xor rcx, r10 + add r9, rbx + vpsllq ymm9, ymm14, 45 + ror rcx, 28 + mov rax, r13 + add r9, rcx + ror rax, 23 + vpsrlq ymm10, ymm14, 61 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rsp+120] + xor rcx, r15 + vpsllq ymm11, ymm14, 3 + xor rax, r13 + and rcx, r13 + vpor ymm8, ymm8, ymm9 + ror rax, 4 + xor rcx, r15 + vpor ymm10, ymm10, ymm11 + xor rax, r13 + add r8, rcx + vpxor ymm8, ymm8, ymm10 + ror rax, 14 + xor rbx, r10 + vpsrlq ymm11, ymm14, 6 + add r8, rax + mov rcx, r9 + vpxor ymm8, ymm8, ymm11 + and rdx, rbx + ror rcx, 5 + vpaddq ymm3, ymm8, ymm3 + xor rcx, r9 + xor rdx, r10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + ror rcx, 28 + mov rax, r12 + add r8, rcx + vpaddq ymm8, ymm0, [rsi] + vpaddq ymm9, ymm1, [rsi+32] + vmovdqu YMMWORD PTR [rsp], ymm8 + vmovdqu YMMWORD PTR [rsp+32], ymm9 + vpaddq ymm8, ymm2, 
[rsi+64] + vpaddq ymm9, ymm3, [rsi+96] + vmovdqu YMMWORD PTR [rsp+64], ymm8 + vmovdqu YMMWORD PTR [rsp+96], ymm9 + sub DWORD PTR [rsp+128], 1 + jne L_sha256_avx2_start + ror rax, 23 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rsp] + xor rcx, r14 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ror rax, 23 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rsp+8] + xor rcx, r13 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + ror rcx, 28 + mov rax, r10 + add r14, rcx + ror rax, 23 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rsp+16] + xor rcx, r12 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ror rax, 23 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rsp+24] + xor rcx, r11 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + ror rcx, 28 + mov rax, r8 + add r12, rcx + ror rax, 23 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rsp+32] + xor rcx, r10 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + xor 
rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ror rax, 23 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rsp+40] + xor rcx, r9 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + ror rcx, 28 + mov rax, r14 + add r10, rcx + ror rax, 23 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rsp+48] + xor rcx, r8 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ror rax, 23 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rsp+56] + xor rcx, r15 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + ror rcx, 28 + mov rax, r12 + add r8, rcx + ror rax, 23 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rsp+64] + xor rcx, r14 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ror rax, 23 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rsp+72] + xor rcx, r13 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + ror rcx, 6 + add r10, 
r14 + xor rcx, r15 + add r14, rdx + ror rcx, 28 + mov rax, r10 + add r14, rcx + ror rax, 23 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rsp+80] + xor rcx, r12 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ror rax, 23 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rsp+88] + xor rcx, r11 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + ror rcx, 28 + mov rax, r8 + add r12, rcx + ror rax, 23 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rsp+96] + xor rcx, r10 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ror rax, 23 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rsp+104] + xor rcx, r9 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + ror rcx, 28 + mov rax, r14 + add r10, rcx + ror rax, 23 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rsp+112] + xor rcx, r8 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + xor rcx, r10 + add r9, rbx + 
ror rcx, 28 + mov rax, r13 + add r9, rcx + ror rax, 23 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rsp+120] + xor rcx, r15 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + ror rcx, 28 + mov rax, r12 + add r8, rcx + add QWORD PTR [rdi], r8 + add QWORD PTR [rdi+8], r9 + add QWORD PTR [rdi+16], r10 + add QWORD PTR [rdi+24], r11 + add QWORD PTR [rdi+32], r12 + add QWORD PTR [rdi+40], r13 + add QWORD PTR [rdi+48], r14 + add QWORD PTR [rdi+56], r15 + xor rax, rax + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+136] + vmovdqu xmm7, OWORD PTR [rsp+152] + vmovdqu xmm8, OWORD PTR [rsp+168] + vmovdqu xmm9, OWORD PTR [rsp+184] + vmovdqu xmm10, OWORD PTR [rsp+200] + vmovdqu xmm11, OWORD PTR [rsp+216] + vmovdqu xmm14, OWORD PTR [rsp+232] + vmovdqu xmm13, OWORD PTR [rsp+248] + vmovdqu xmm12, OWORD PTR [rsp+264] + vmovdqu xmm15, OWORD PTR [rsp+280] + add rsp, 296 + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha512_AVX2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha512_AVX2_Len PROC ; Hashes consecutive 128-byte blocks. In: rcx = structure pointer (copied to rdi; 64-bit state words at offsets 0 to 56, a 128-byte buffer at offset 64, input pointer at offset 224), edx = byte count (copied to rbp; assumed to be a non-zero multiple of 128, verify against caller). The input pointer at offset 224 is advanced as data is consumed. Returns rax = 0. + push rbx + push r12 + push r13 + push r14 + push r15 + push rsi + push rdi + push rbp + mov rdi, rcx + mov rbp, rdx + sub rsp, 160 ; room for the ten 16-byte saves of xmm6 to xmm15 that follow + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm14 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm12 + vmovdqu OWORD PTR [rsp+144], xmm15 + test bpl, 128 ; bit 7 of the byte count set means an odd number of 128-byte blocks + je L_sha512_len_avx2_block + mov rbx, QWORD PTR [rdi+224] ; odd block: copy 128 input bytes into the buffer at offset 64 and hash that block on its own + vmovdqu ymm0, YMMWORD PTR [rbx] + vmovdqu ymm1, YMMWORD PTR [rbx+32] + vmovdqu ymm2, YMMWORD PTR [rbx+64] + vmovdqu ymm3, YMMWORD PTR [rbx+96] + vmovups YMMWORD PTR [rdi+64], ymm0 +
vmovups YMMWORD PTR [rdi+96], ymm1 + vmovups YMMWORD PTR [rdi+128], ymm2 + vmovups YMMWORD PTR [rdi+160], ymm3 + call Transform_Sha512_AVX2 ; rcx has not been modified since entry, so it still holds the structure pointer + add QWORD PTR [rdi+224], 128 + sub ebp, 128 + jz L_sha512_len_avx2_done ; nothing left after the single block + L_sha512_len_avx2_block: + sub rsp, 1352 ; scratch frame: 1280 bytes of message-plus-constant values for two blocks at rsp, remaining byte count at offset 1344 + mov rcx, QWORD PTR [rdi+224] + vmovdqu ymm15, YMMWORD PTR L_avx2_sha512_flip_mask + mov r8, QWORD PTR [rdi] + mov r9, QWORD PTR [rdi+8] + mov r10, QWORD PTR [rdi+16] + mov r11, QWORD PTR [rdi+24] + mov r12, QWORD PTR [rdi+32] + mov r13, QWORD PTR [rdi+40] + mov r14, QWORD PTR [rdi+48] + mov r15, QWORD PTR [rdi+56] + mov QWORD PTR [rsp+1344], rbp ; park the remaining byte count so rbp can be reused as a scratch pointer + ; Start of loop processing two blocks +L_sha512_len_avx2_begin: + mov rbp, rsp ; rbp walks the scratch area + mov rsi, QWORD PTR [ptr_L_avx2_sha512_k_2] + mov rbx, r9 + mov rax, r12 + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu xmm1, OWORD PTR [rcx+16] + vinserti128 ymm0, ymm0, OWORD PTR [rcx+128], 1 ; low 128-bit lane holds words of the block at rcx, high lane the same words of the next block at offset 128 + vinserti128 ymm1, ymm1, OWORD PTR [rcx+144], 1 + vpshufb ymm0, ymm0, ymm15 ; shuffle bytes with the flip mask loaded into ymm15 + vpshufb ymm1, ymm1, ymm15 + vmovdqu xmm2, OWORD PTR [rcx+32] + vmovdqu xmm3, OWORD PTR [rcx+48] + vinserti128 ymm2, ymm2, OWORD PTR [rcx+160], 1 + vinserti128 ymm3, ymm3, OWORD PTR [rcx+176], 1 + vpshufb ymm2, ymm2, ymm15 + vpshufb ymm3, ymm3, ymm15 + vmovdqu xmm4, OWORD PTR [rcx+64] + vmovdqu xmm5, OWORD PTR [rcx+80] + vinserti128 ymm4, ymm4, OWORD PTR [rcx+192], 1 + vinserti128 ymm5, ymm5, OWORD PTR [rcx+208], 1 + vpshufb ymm4, ymm4, ymm15 + vpshufb ymm5, ymm5, ymm15 + vmovdqu xmm6, OWORD PTR [rcx+96] + vmovdqu xmm7, OWORD PTR [rcx+112] + vinserti128 ymm6, ymm6, OWORD PTR [rcx+224], 1 + vinserti128 ymm7, ymm7, OWORD PTR [rcx+240], 1 + vpshufb ymm6, ymm6, ymm15 + vpshufb ymm7, ymm7, ymm15 + xor rbx, r10 + ; Start of 16 rounds +L_sha512_len_avx2_start: + vpaddq ymm8, ymm0, [rsi] + vpaddq ymm9, ymm1, [rsi+32] + vmovdqu YMMWORD PTR [rbp], ymm8 + vmovdqu YMMWORD PTR [rbp+32], ymm9 + vpaddq ymm8, ymm2, [rsi+64] + vpaddq ymm9, ymm3, [rsi+96] + vmovdqu YMMWORD PTR [rbp+64], ymm8 + vmovdqu YMMWORD PTR [rbp+96], ymm9 + vpaddq ymm8,
ymm4, [rsi+128] + vpaddq ymm9, ymm5, [rsi+160] + vmovdqu YMMWORD PTR [rbp+128], ymm8 + vmovdqu YMMWORD PTR [rbp+160], ymm9 + vpaddq ymm8, ymm6, [rsi+192] + vpaddq ymm9, ymm7, [rsi+224] + vmovdqu YMMWORD PTR [rbp+192], ymm8 + vmovdqu YMMWORD PTR [rbp+224], ymm9 + ; msg_sched: 0-1 + ror rax, 23 + vpalignr ymm12, ymm1, ymm0, 8 + vpalignr ymm13, ymm5, ymm4, 8 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rbp] + xor rcx, r14 + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm0, ymm13, ymm0 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ror rax, 23 + vpaddq ymm0, ymm8, ymm0 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rbp+8] + xor rcx, r13 + vpsrlq ymm8, ymm7, 19 + vpsllq ymm9, ymm7, 45 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + vpsrlq ymm10, ymm7, 61 + vpsllq ymm11, ymm7, 3 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm7, 6 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + vpxor ymm8, ymm8, ymm11 + ror rcx, 28 + mov rax, r10 + add r14, rcx + vpaddq ymm0, ymm8, ymm0 + ; msg_sched done: 0-1 + ; msg_sched: 4-5 + ror rax, 23 + vpalignr ymm12, ymm2, ymm1, 8 + vpalignr ymm13, ymm6, ymm5, 8 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rbp+32] + xor rcx, r12 + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11,
ymm12, 56 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm1, ymm13, ymm1 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ror rax, 23 + vpaddq ymm1, ymm8, ymm1 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rbp+40] + xor rcx, r11 + vpsrlq ymm8, ymm0, 19 + vpsllq ymm9, ymm0, 45 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + vpsrlq ymm10, ymm0, 61 + vpsllq ymm11, ymm0, 3 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm0, 6 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + vpxor ymm8, ymm8, ymm11 + ror rcx, 28 + mov rax, r8 + add r12, rcx + vpaddq ymm1, ymm8, ymm1 + ; msg_sched done: 4-5 + ; msg_sched: 8-9 + ror rax, 23 + vpalignr ymm12, ymm3, ymm2, 8 + vpalignr ymm13, ymm7, ymm6, 8 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rbp+64] + xor rcx, r10 + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm2, ymm13, ymm2 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ror rax, 23 + vpaddq ymm2, ymm8, ymm2 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rbp+72] + xor rcx, r9 + vpsrlq ymm8, ymm1, 19 + vpsllq ymm9, ymm1, 45 + xor
rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + vpsrlq ymm10, ymm1, 61 + vpsllq ymm11, ymm1, 3 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm1, 6 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + vpxor ymm8, ymm8, ymm11 + ror rcx, 28 + mov rax, r14 + add r10, rcx + vpaddq ymm2, ymm8, ymm2 + ; msg_sched done: 8-9 + ; msg_sched: 12-13 + ror rax, 23 + vpalignr ymm12, ymm4, ymm3, 8 + vpalignr ymm13, ymm0, ymm7, 8 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rbp+96] + xor rcx, r8 + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm3, ymm13, ymm3 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ror rax, 23 + vpaddq ymm3, ymm8, ymm3 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rbp+104] + xor rcx, r15 + vpsrlq ymm8, ymm2, 19 + vpsllq ymm9, ymm2, 45 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + vpsrlq ymm10, ymm2, 61 + vpsllq ymm11, ymm2, 3 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm2, 6 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + vpxor ymm8, ymm8, ymm11 + ror rcx, 28 + mov rax, r12 + add r8, rcx + vpaddq ymm3, ymm8, ymm3 + ; msg_sched done: 12-13 + ; msg_sched: 16-17 + ror rax, 23 + vpalignr ymm12,
ymm5, ymm4, 8 + vpalignr ymm13, ymm1, ymm0, 8 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rbp+128] + xor rcx, r14 + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm4, ymm13, ymm4 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ror rax, 23 + vpaddq ymm4, ymm8, ymm4 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rbp+136] + xor rcx, r13 + vpsrlq ymm8, ymm3, 19 + vpsllq ymm9, ymm3, 45 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + vpsrlq ymm10, ymm3, 61 + vpsllq ymm11, ymm3, 3 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm3, 6 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + vpxor ymm8, ymm8, ymm11 + ror rcx, 28 + mov rax, r10 + add r14, rcx + vpaddq ymm4, ymm8, ymm4 + ; msg_sched done: 16-17 + ; msg_sched: 20-21 + ror rax, 23 + vpalignr ymm12, ymm6, ymm5, 8 + vpalignr ymm13, ymm2, ymm1, 8 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rbp+160] + xor rcx, r12 + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + vpxor ymm8,
ymm8, ymm11 + vpaddq ymm5, ymm13, ymm5 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ror rax, 23 + vpaddq ymm5, ymm8, ymm5 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rbp+168] + xor rcx, r11 + vpsrlq ymm8, ymm4, 19 + vpsllq ymm9, ymm4, 45 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + vpsrlq ymm10, ymm4, 61 + vpsllq ymm11, ymm4, 3 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm4, 6 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + vpxor ymm8, ymm8, ymm11 + ror rcx, 28 + mov rax, r8 + add r12, rcx + vpaddq ymm5, ymm8, ymm5 + ; msg_sched done: 20-21 + ; msg_sched: 24-25 + ror rax, 23 + vpalignr ymm12, ymm7, ymm6, 8 + vpalignr ymm13, ymm3, ymm2, 8 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rbp+192] + xor rcx, r10 + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm6, ymm13, ymm6 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ror rax, 23 + vpaddq ymm6, ymm8, ymm6 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rbp+200] + xor rcx, r9 + vpsrlq ymm8, ymm5, 19 + vpsllq ymm9, ymm5, 45 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + vpsrlq ymm10, ymm5, 61 + vpsllq ymm11, ymm5, 3 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor
rdx, r12 + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm5, 6 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + vpxor ymm8, ymm8, ymm11 + ror rcx, 28 + mov rax, r14 + add r10, rcx + vpaddq ymm6, ymm8, ymm6 + ; msg_sched done: 24-25 + ; msg_sched: 28-29 + ror rax, 23 + vpalignr ymm12, ymm0, ymm7, 8 + vpalignr ymm13, ymm4, ymm3, 8 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rbp+224] + xor rcx, r8 + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm7, ymm13, ymm7 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ror rax, 23 + vpaddq ymm7, ymm8, ymm7 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rbp+232] + xor rcx, r15 + vpsrlq ymm8, ymm6, 19 + vpsllq ymm9, ymm6, 45 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + vpsrlq ymm10, ymm6, 61 + vpsllq ymm11, ymm6, 3 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm6, 6 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + vpxor ymm8, ymm8, ymm11 + ror rcx, 28 + mov rax, r12 + add r8, rcx + vpaddq ymm7, ymm8, ymm7 + ; msg_sched done: 28-29 + add rsi, 256 + add rbp, 256 + cmp rsi, QWORD PTR [L_avx2_sha512_k_2_end] + jne L_sha512_len_avx2_start ; loop exits once rsi equals the address stored at L_avx2_sha512_k_2_end; the 16 rounds that follow do no further message scheduling + vpaddq ymm8, ymm0, [rsi] + vpaddq ymm9, ymm1, [rsi+32] + vmovdqu YMMWORD PTR [rbp], ymm8 + vmovdqu YMMWORD PTR [rbp+32], ymm9 + vpaddq ymm8, ymm2, [rsi+64] + vpaddq ymm9, ymm3, [rsi+96] + vmovdqu YMMWORD PTR [rbp+64], ymm8 + vmovdqu
YMMWORD PTR [rbp+96], ymm9 + vpaddq ymm8, ymm4, [rsi+128] + vpaddq ymm9, ymm5, [rsi+160] + vmovdqu YMMWORD PTR [rbp+128], ymm8 + vmovdqu YMMWORD PTR [rbp+160], ymm9 + vpaddq ymm8, ymm6, [rsi+192] + vpaddq ymm9, ymm7, [rsi+224] + vmovdqu YMMWORD PTR [rbp+192], ymm8 + vmovdqu YMMWORD PTR [rbp+224], ymm9 + ror rax, 23 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rbp] + xor rcx, r14 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ror rax, 23 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rbp+8] + xor rcx, r13 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + ror rcx, 28 + mov rax, r10 + add r14, rcx + ror rax, 23 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rbp+32] + xor rcx, r12 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ror rax, 23 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rbp+40] + xor rcx, r11 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + ror rcx, 28 + mov rax, r8 + add r12, rcx + ror rax, 23 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rbp+64] + xor rcx, r10 + xor rax, r8 + and rcx, r8 + ror rax, 4 +
xor rcx, r10 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ror rax, 23 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rbp+72] + xor rcx, r9 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + ror rcx, 28 + mov rax, r14 + add r10, rcx + ror rax, 23 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rbp+96] + xor rcx, r8 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ror rax, 23 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rbp+104] + xor rcx, r15 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + ror rcx, 28 + mov rax, r12 + add r8, rcx + ror rax, 23 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rbp+128] + xor rcx, r14 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ror rax, 23 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rbp+136] + xor rcx, r13 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + xor rax, r11 + add r14, rcx +
ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + ror rcx, 28 + mov rax, r10 + add r14, rcx + ror rax, 23 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rbp+160] + xor rcx, r12 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ror rax, 23 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rbp+168] + xor rcx, r11 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + ror rcx, 28 + mov rax, r8 + add r12, rcx + ror rax, 23 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rbp+192] + xor rcx, r10 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ror rax, 23 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rbp+200] + xor rcx, r9 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + ror rcx, 28 + mov rax, r14 + add r10, rcx + ror rax, 23 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rbp+224] + xor rcx, r8 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + add
r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ror rax, 23 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rbp+232] + xor rcx, r15 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + ror rcx, 28 + mov rax, r12 + add r8, rcx + sub rbp, 1024 ; step rbp back 1024 bytes so the second block reads from the start of the scratch area (assumes the loop above ran four times) + add r8, QWORD PTR [rdi] + add r9, QWORD PTR [rdi+8] + add r10, QWORD PTR [rdi+16] + add r11, QWORD PTR [rdi+24] + add r12, QWORD PTR [rdi+32] + add r13, QWORD PTR [rdi+40] + add r14, QWORD PTR [rdi+48] + add r15, QWORD PTR [rdi+56] + mov QWORD PTR [rdi], r8 + mov QWORD PTR [rdi+8], r9 + mov QWORD PTR [rdi+16], r10 + mov QWORD PTR [rdi+24], r11 + mov QWORD PTR [rdi+32], r12 + mov QWORD PTR [rdi+40], r13 + mov QWORD PTR [rdi+48], r14 + mov QWORD PTR [rdi+56], r15 + mov rbx, r9 + mov rax, r12 + xor rbx, r10 + mov rsi, 5 ; five passes of 16 rounds each for the second block +L_sha512_len_avx2_tail: + ror rax, 23 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rbp+16] ; offsets 16, 24, 48, 56 and so on pick the upper-lane qwords, which came from the second block + xor rcx, r14 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ror rax, 23 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rbp+24] + xor rcx, r13 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + ror rcx, 28 + mov rax, r10 + add r14, rcx + ror rax, 23 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rbp+48] + xor
rcx, r12 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ror rax, 23 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rbp+56] + xor rcx, r11 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + ror rcx, 28 + mov rax, r8 + add r12, rcx + ror rax, 23 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rbp+80] + xor rcx, r10 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + xor rax, r8 + add r11, rcx + ror rax, 14 + xor rdx, r13 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ror rax, 23 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rbp+88] + xor rcx, r9 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + ror rcx, 28 + mov rax, r14 + add r10, rcx + ror rax, 23 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rbp+112] + xor rcx, r8 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ror rax, 23 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rbp+120] + xor rcx, r15 + xor rax, r13 + and rcx, r13 +
ror rax, 4 + xor rcx, r15 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + ror rcx, 28 + mov rax, r12 + add r8, rcx + ror rax, 23 + mov rdx, r8 + mov rcx, r13 + add r15, QWORD PTR [rbp+144] + xor rcx, r14 + xor rax, r12 + and rcx, r12 + ror rax, 4 + xor rcx, r14 + xor rax, r12 + add r15, rcx + ror rax, 14 + xor rdx, r9 + add r15, rax + mov rcx, r8 + and rbx, rdx + ror rcx, 5 + xor rcx, r8 + xor rbx, r9 + ror rcx, 6 + add r11, r15 + xor rcx, r8 + add r15, rbx + ror rcx, 28 + mov rax, r11 + add r15, rcx + ror rax, 23 + mov rbx, r15 + mov rcx, r12 + add r14, QWORD PTR [rbp+152] + xor rcx, r13 + xor rax, r11 + and rcx, r11 + ror rax, 4 + xor rcx, r13 + xor rax, r11 + add r14, rcx + ror rax, 14 + xor rbx, r8 + add r14, rax + mov rcx, r15 + and rdx, rbx + ror rcx, 5 + xor rcx, r15 + xor rdx, r8 + ror rcx, 6 + add r10, r14 + xor rcx, r15 + add r14, rdx + ror rcx, 28 + mov rax, r10 + add r14, rcx + ror rax, 23 + mov rdx, r14 + mov rcx, r11 + add r13, QWORD PTR [rbp+176] + xor rcx, r12 + xor rax, r10 + and rcx, r10 + ror rax, 4 + xor rcx, r12 + xor rax, r10 + add r13, rcx + ror rax, 14 + xor rdx, r15 + add r13, rax + mov rcx, r14 + and rbx, rdx + ror rcx, 5 + xor rcx, r14 + xor rbx, r15 + ror rcx, 6 + add r9, r13 + xor rcx, r14 + add r13, rbx + ror rcx, 28 + mov rax, r9 + add r13, rcx + ror rax, 23 + mov rbx, r13 + mov rcx, r10 + add r12, QWORD PTR [rbp+184] + xor rcx, r11 + xor rax, r9 + and rcx, r9 + ror rax, 4 + xor rcx, r11 + xor rax, r9 + add r12, rcx + ror rax, 14 + xor rbx, r14 + add r12, rax + mov rcx, r13 + and rdx, rbx + ror rcx, 5 + xor rcx, r13 + xor rdx, r14 + ror rcx, 6 + add r8, r12 + xor rcx, r13 + add r12, rdx + ror rcx, 28 + mov rax, r8 + add r12, rcx + ror rax, 23 + mov rdx, r12 + mov rcx, r9 + add r11, QWORD PTR [rbp+208] + xor rcx, r10 + xor rax, r8 + and rcx, r8 + ror rax, 4 + xor rcx, r10 + xor rax, r8
+ add r11, rcx + ror rax, 14 + xor rdx, r13 + add r11, rax + mov rcx, r12 + and rbx, rdx + ror rcx, 5 + xor rcx, r12 + xor rbx, r13 + ror rcx, 6 + add r15, r11 + xor rcx, r12 + add r11, rbx + ror rcx, 28 + mov rax, r15 + add r11, rcx + ror rax, 23 + mov rbx, r11 + mov rcx, r8 + add r10, QWORD PTR [rbp+216] + xor rcx, r9 + xor rax, r15 + and rcx, r15 + ror rax, 4 + xor rcx, r9 + xor rax, r15 + add r10, rcx + ror rax, 14 + xor rbx, r12 + add r10, rax + mov rcx, r11 + and rdx, rbx + ror rcx, 5 + xor rcx, r11 + xor rdx, r12 + ror rcx, 6 + add r14, r10 + xor rcx, r11 + add r10, rdx + ror rcx, 28 + mov rax, r14 + add r10, rcx + ror rax, 23 + mov rdx, r10 + mov rcx, r15 + add r9, QWORD PTR [rbp+240] + xor rcx, r8 + xor rax, r14 + and rcx, r14 + ror rax, 4 + xor rcx, r8 + xor rax, r14 + add r9, rcx + ror rax, 14 + xor rdx, r11 + add r9, rax + mov rcx, r10 + and rbx, rdx + ror rcx, 5 + xor rcx, r10 + xor rbx, r11 + ror rcx, 6 + add r13, r9 + xor rcx, r10 + add r9, rbx + ror rcx, 28 + mov rax, r13 + add r9, rcx + ror rax, 23 + mov rbx, r9 + mov rcx, r14 + add r8, QWORD PTR [rbp+248] + xor rcx, r15 + xor rax, r13 + and rcx, r13 + ror rax, 4 + xor rcx, r15 + xor rax, r13 + add r8, rcx + ror rax, 14 + xor rbx, r10 + add r8, rax + mov rcx, r9 + and rdx, rbx + ror rcx, 5 + xor rcx, r9 + xor rdx, r10 + ror rcx, 6 + add r12, r8 + xor rcx, r9 + add r8, rdx + ror rcx, 28 + mov rax, r12 + add r8, rcx + add rbp, 256 + sub rsi, 1 + jnz L_sha512_len_avx2_tail + add r8, QWORD PTR [rdi] + add r9, QWORD PTR [rdi+8] + add r10, QWORD PTR [rdi+16] + add r11, QWORD PTR [rdi+24] + add r12, QWORD PTR [rdi+32] + add r13, QWORD PTR [rdi+40] + add r14, QWORD PTR [rdi+48] + add r15, QWORD PTR [rdi+56] + mov rcx, QWORD PTR [rdi+224] + add rcx, 256 + sub DWORD PTR [rsp+1344], 256 ; two blocks consumed; the zero flag set here is still valid at the jnz below because the mov instructions in between do not modify flags + mov QWORD PTR [rdi+224], rcx + mov QWORD PTR [rdi], r8 + mov QWORD PTR [rdi+8], r9 + mov QWORD PTR [rdi+16], r10 + mov QWORD PTR [rdi+24], r11 + mov QWORD PTR [rdi+32], r12 + mov QWORD PTR [rdi+40], r13 + mov QWORD PTR
[rdi+48], r14 + mov QWORD PTR [rdi+56], r15 + jnz L_sha512_len_avx2_begin + add rsp, 1352 +L_sha512_len_avx2_done: + xor rax, rax ; return value 0 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm14, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm12, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + pop rbp + pop rdi + pop rsi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha512_AVX2_Len ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_rorx_sha512_k QWORD 428a2f98d728ae22h, 7137449123ef65cdh ; the 80 SHA-512 round constants, two per row + QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch + QWORD 3956c25bf348b538h, 59f111f1b605d019h + QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h + QWORD 0d807aa98a3030242h, 12835b0145706fbeh + QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h + QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h + QWORD 9bdc06a725c71235h, 0c19bf174cf692694h + QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h + QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h + QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h + QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h + QWORD 983e5152ee66dfabh, 0a831c66d2db43210h + QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h + QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h + QWORD 06ca6351e003826fh, 142929670a0e6e70h + QWORD 27b70a8546d22ffch, 2e1b21385c26c926h + QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh + QWORD 650a73548baf63deh, 766a0abb3c77b2a8h + QWORD 81c2c92e47edaee6h, 92722c851482353bh + QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h + QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h + QWORD 0d192e819d6ef5218h, 0d69906245565a910h + QWORD 0f40e35855771202ah, 106aa07032bbd1b8h + QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h + QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h + QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh + QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h + QWORD 748f82ee5defb2fch,
78a5636f43172f60h + QWORD 84c87814a1f0ab72h, 8cc702081a6439ech + QWORD 90befffa23631e28h, 0a4506cebde82bde9h + QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh + QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h + QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h + QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h + QWORD 113f9804bef90daeh, 1b710b35131c471bh + QWORD 28db77f523047d84h, 32caab7b40c72493h + QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch + QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah + QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h +ptr_L_avx2_rorx_sha512_k QWORD L_avx2_rorx_sha512_k ; address of the table above; Transform_Sha512_AVX2_RORX loads it into rsi +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_rorx_sha512_k_2 QWORD 428a2f98d728ae22h, 7137449123ef65cdh ; same constants with every row repeated, so one 32-byte load yields the same pair in both 128-bit lanes + QWORD 428a2f98d728ae22h, 7137449123ef65cdh + QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch + QWORD 0b5c0fbcfec4d3b2fh, 0e9b5dba58189dbbch + QWORD 3956c25bf348b538h, 59f111f1b605d019h + QWORD 3956c25bf348b538h, 59f111f1b605d019h + QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h + QWORD 923f82a4af194f9bh, 0ab1c5ed5da6d8118h + QWORD 0d807aa98a3030242h, 12835b0145706fbeh + QWORD 0d807aa98a3030242h, 12835b0145706fbeh + QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h + QWORD 243185be4ee4b28ch, 550c7dc3d5ffb4e2h + QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h + QWORD 72be5d74f27b896fh, 80deb1fe3b1696b1h + QWORD 9bdc06a725c71235h, 0c19bf174cf692694h + QWORD 9bdc06a725c71235h, 0c19bf174cf692694h + QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h + QWORD 0e49b69c19ef14ad2h, 0efbe4786384f25e3h + QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h + QWORD 0fc19dc68b8cd5b5h, 240ca1cc77ac9c65h + QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h + QWORD 2de92c6f592b0275h, 4a7484aa6ea6e483h + QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h + QWORD 5cb0a9dcbd41fbd4h, 76f988da831153b5h + QWORD 983e5152ee66dfabh, 0a831c66d2db43210h + QWORD 983e5152ee66dfabh, 0a831c66d2db43210h + QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h + QWORD 0b00327c898fb213fh, 0bf597fc7beef0ee4h + QWORD 0c6e00bf33da88fc2h, 0d5a79147930aa725h + QWORD 0c6e00bf33da88fc2h,
0d5a79147930aa725h + QWORD 06ca6351e003826fh, 142929670a0e6e70h + QWORD 06ca6351e003826fh, 142929670a0e6e70h + QWORD 27b70a8546d22ffch, 2e1b21385c26c926h + QWORD 27b70a8546d22ffch, 2e1b21385c26c926h + QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh + QWORD 4d2c6dfc5ac42aedh, 53380d139d95b3dfh + QWORD 650a73548baf63deh, 766a0abb3c77b2a8h + QWORD 650a73548baf63deh, 766a0abb3c77b2a8h + QWORD 81c2c92e47edaee6h, 92722c851482353bh + QWORD 81c2c92e47edaee6h, 92722c851482353bh + QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h + QWORD 0a2bfe8a14cf10364h, 0a81a664bbc423001h + QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h + QWORD 0c24b8b70d0f89791h, 0c76c51a30654be30h + QWORD 0d192e819d6ef5218h, 0d69906245565a910h + QWORD 0d192e819d6ef5218h, 0d69906245565a910h + QWORD 0f40e35855771202ah, 106aa07032bbd1b8h + QWORD 0f40e35855771202ah, 106aa07032bbd1b8h + QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h + QWORD 19a4c116b8d2d0c8h, 1e376c085141ab53h + QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h + QWORD 2748774cdf8eeb99h, 34b0bcb5e19b48a8h + QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh + QWORD 391c0cb3c5c95a63h, 4ed8aa4ae3418acbh + QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h + QWORD 5b9cca4f7763e373h, 682e6ff3d6b2b8a3h + QWORD 748f82ee5defb2fch, 78a5636f43172f60h + QWORD 748f82ee5defb2fch, 78a5636f43172f60h + QWORD 84c87814a1f0ab72h, 8cc702081a6439ech + QWORD 84c87814a1f0ab72h, 8cc702081a6439ech + QWORD 90befffa23631e28h, 0a4506cebde82bde9h + QWORD 90befffa23631e28h, 0a4506cebde82bde9h + QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh + QWORD 0bef9a3f7b2c67915h, 0c67178f2e372532bh + QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h + QWORD 0ca273eceea26619ch, 0d186b8c721c0c207h + QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h + QWORD 0eada7dd6cde0eb1eh, 0f57d4f7fee6ed178h + QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h + QWORD 06f067aa72176fbah, 0a637dc5a2c898a6h + QWORD 113f9804bef90daeh, 1b710b35131c471bh + QWORD 113f9804bef90daeh, 1b710b35131c471bh + QWORD 28db77f523047d84h, 32caab7b40c72493h + QWORD
28db77f523047d84h, 32caab7b40c72493h + QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch + QWORD 3c9ebe0a15c9bebch, 431d67c49c100d4ch + QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah + QWORD 4cc5d4becb3e42b6h, 597f299cfc657e2ah + QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h + QWORD 5fcb6fab3ad6faech, 6c44198c4a475817h +ptr_L_avx2_rorx_sha512_k_2 QWORD L_avx2_rorx_sha512_k_2 +_DATA ENDS +_DATA SEGMENT +ALIGN 8 +L_avx2_rorx_sha512_k_2_end QWORD 1024+L_avx2_rorx_sha512_k_2 ; address 1024 bytes into the doubled table +ptr_L_avx2_rorx_sha512_k_2_end QWORD L_avx2_rorx_sha512_k_2_end +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_rorx_sha512_flip_mask QWORD 0001020304050607h, 08090a0b0c0d0e0fh ; vpshufb control that reverses the byte order of each 64-bit word + QWORD 0001020304050607h, 08090a0b0c0d0e0fh +ptr_L_avx2_rorx_sha512_flip_mask QWORD L_avx2_rorx_sha512_flip_mask +_DATA ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha512_AVX2_RORX PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + mov rdi, rcx + sub rsp, 296 + vmovdqu OWORD PTR [rsp+136], xmm6 + vmovdqu OWORD PTR [rsp+152], xmm7 + vmovdqu OWORD PTR [rsp+168], xmm8 + vmovdqu OWORD PTR [rsp+184], xmm9 + vmovdqu OWORD PTR [rsp+200], xmm10 + vmovdqu OWORD PTR [rsp+216], xmm11 + vmovdqu OWORD PTR [rsp+232], xmm14 + vmovdqu OWORD PTR [rsp+248], xmm13 + vmovdqu OWORD PTR [rsp+264], xmm12 + vmovdqu OWORD PTR [rsp+280], xmm15 + lea rcx, QWORD PTR [rdi+64] + vmovdqu ymm15, YMMWORD PTR L_avx2_rorx_sha512_flip_mask + mov r8, QWORD PTR [rdi] + mov r9, QWORD PTR [rdi+8] + mov r10, QWORD PTR [rdi+16] + mov r11, QWORD PTR [rdi+24] + mov r12, QWORD PTR [rdi+32] + mov r13, QWORD PTR [rdi+40] + mov r14, QWORD PTR [rdi+48] + mov r15, QWORD PTR [rdi+56] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vpshufb ymm0, ymm0, ymm15 + vpshufb ymm1, ymm1, ymm15 + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vpshufb ymm2, ymm2, ymm15 + vpshufb ymm3, ymm3, ymm15 + mov DWORD PTR [rsp+128], 4 + mov rsi, QWORD PTR [ptr_L_avx2_rorx_sha512_k] + mov rbx, r9 + xor rdx, rdx + xor rbx, r10 +
; set_w_k: 0 + vpaddq ymm8, ymm0, [rsi] + vpaddq ymm9, ymm1, [rsi+32] + vmovdqu YMMWORD PTR [rsp], ymm8 + vmovdqu YMMWORD PTR [rsp+32], ymm9 + vpaddq ymm8, ymm2, [rsi+64] + vpaddq ymm9, ymm3, [rsi+96] + vmovdqu YMMWORD PTR [rsp+64], ymm8 + vmovdqu YMMWORD PTR [rsp+96], ymm9 + ; Start of 16 rounds +L_sha256_len_avx2_rorx_start: + add rsi, 128 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + vpblendd ymm12, ymm0, ymm1, 3 + vpblendd ymm13, ymm2, ymm3, 3 + add r15, QWORD PTR [rsp] + mov rdx, r13 + xor rcx, rax + vpermq ymm12, ymm12, 57 + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + vpermq ymm13, ymm13, 57 + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + vpsrlq ymm11, ymm12, 7 + mov rdx, r9 + add r11, r15 + xor rdx, r8 + vperm2I128 ymm14, ymm3, ymm3, 129 + and rbx, rdx + add r15, rax + xor rbx, r9 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + vpxor ymm8, ymm8, ymm10 + add r14, QWORD PTR [rsp+8] + mov rbx, r12 + xor rcx, rax + vpxor ymm8, ymm8, ymm11 + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + vpaddq ymm0, ymm13, ymm0 + vpaddq ymm0, ymm8, ymm0 + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + vpsrlq ymm8, ymm14, 19 + vpsllq ymm9, ymm14, 45 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + vpsrlq ymm10, ymm14, 61 + vpsllq ymm11, ymm14, 3 + vpor ymm8, ymm8, ymm9 + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + vpor ymm10, ymm10, ymm11 + mov rbx, r8 + lea r10, QWORD PTR [r10+r14] + xor rbx, r15 + vpxor ymm8, ymm8, ymm10 + and rdx, rbx + add r14, rax + xor rdx, r8 + vpsrlq ymm11, ymm14, 6 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + vpxor ymm8, ymm8, ymm11 + add r13, QWORD PTR [rsp+16] + mov rdx, r11 + xor rcx, rax + vpaddq ymm0, ymm8, ymm0 + xor rdx, r12 + rorx rax, r10, 41 + 
xor rax, rcx + vperm2I128 ymm14, ymm0, ymm0, 8 + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + vpsrlq ymm8, ymm14, 19 + vpsllq ymm9, ymm14, 45 + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + vpsrlq ymm10, ymm14, 61 + vpsllq ymm11, ymm14, 3 + vpor ymm8, ymm8, ymm9 + mov rdx, r15 + add r9, r13 + xor rdx, r14 + vpor ymm10, ymm10, ymm11 + and rbx, rdx + add r13, rax + xor rbx, r15 + vpxor ymm8, ymm8, ymm10 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + vpsrlq ymm11, ymm14, 6 + add r12, QWORD PTR [rsp+24] + mov rbx, r10 + xor rcx, rax + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + vpxor ymm8, ymm8, ymm11 + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + vpaddq ymm0, ymm8, ymm0 + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + vpaddq ymm8, ymm0, [rsi] + mov rbx, r14 + lea r8, QWORD PTR [r8+r12] + xor rbx, r13 + and rdx, rbx + add r12, rax + xor rdx, r14 + vmovdqu YMMWORD PTR [rsp], ymm8 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + vpblendd ymm12, ymm1, ymm2, 3 + vpblendd ymm13, ymm3, ymm0, 3 + add r11, QWORD PTR [rsp+32] + mov rdx, r9 + xor rcx, rax + vpermq ymm12, ymm12, 57 + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + vpermq ymm13, ymm13, 57 + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + vpsrlq ymm11, ymm12, 7 + mov rdx, r13 + add r15, r11 + xor rdx, r12 + vperm2I128 ymm14, ymm0, ymm0, 129 + and rbx, rdx + add r11, rax + xor rbx, r13 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + vpxor ymm8, ymm8, ymm10 + add r10, QWORD PTR [rsp+40] + mov rbx, r8 + xor rcx, rax + vpxor ymm8, ymm8, ymm11 + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + vpaddq ymm1, ymm13, ymm1 + vpaddq 
ymm1, ymm8, ymm1 + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + vpsrlq ymm8, ymm14, 19 + vpsllq ymm9, ymm14, 45 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + vpsrlq ymm10, ymm14, 61 + vpsllq ymm11, ymm14, 3 + vpor ymm8, ymm8, ymm9 + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + vpor ymm10, ymm10, ymm11 + mov rbx, r12 + lea r14, QWORD PTR [r14+r10] + xor rbx, r11 + vpxor ymm8, ymm8, ymm10 + and rdx, rbx + add r10, rax + xor rdx, r12 + vpsrlq ymm11, ymm14, 6 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + vpxor ymm8, ymm8, ymm11 + add r9, QWORD PTR [rsp+48] + mov rdx, r15 + xor rcx, rax + vpaddq ymm1, ymm8, ymm1 + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + vperm2I128 ymm14, ymm1, ymm1, 8 + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + vpsrlq ymm8, ymm14, 19 + vpsllq ymm9, ymm14, 45 + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + vpsrlq ymm10, ymm14, 61 + vpsllq ymm11, ymm14, 3 + vpor ymm8, ymm8, ymm9 + mov rdx, r11 + add r13, r9 + xor rdx, r10 + vpor ymm10, ymm10, ymm11 + and rbx, rdx + add r9, rax + xor rbx, r11 + vpxor ymm8, ymm8, ymm10 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + vpsrlq ymm11, ymm14, 6 + add r8, QWORD PTR [rsp+56] + mov rbx, r14 + xor rcx, rax + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + vpxor ymm8, ymm8, ymm11 + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + vpaddq ymm1, ymm8, ymm1 + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + vpaddq ymm8, ymm1, [rsi+32] + mov rbx, r10 + lea r12, QWORD PTR [r12+r8] + xor rbx, r9 + and rdx, rbx + add r8, rax + xor rdx, r10 + vmovdqu YMMWORD PTR [rsp+32], ymm8 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + vpblendd ymm12, ymm2, ymm3, 3 + vpblendd ymm13, ymm0, ymm1, 3 + add r15, QWORD PTR [rsp+64] + mov rdx, r13 + xor rcx, rax + vpermq ymm12, ymm12, 57 + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + vpermq ymm13, ymm13, 57 + and rdx, r12 + add r15, 
rax + rorx rax, r8, 28 + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + vpsrlq ymm11, ymm12, 7 + mov rdx, r9 + add r11, r15 + xor rdx, r8 + vperm2I128 ymm14, ymm1, ymm1, 129 + and rbx, rdx + add r15, rax + xor rbx, r9 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + vpxor ymm8, ymm8, ymm10 + add r14, QWORD PTR [rsp+72] + mov rbx, r12 + xor rcx, rax + vpxor ymm8, ymm8, ymm11 + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + vpaddq ymm2, ymm13, ymm2 + vpaddq ymm2, ymm8, ymm2 + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + vpsrlq ymm8, ymm14, 19 + vpsllq ymm9, ymm14, 45 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + vpsrlq ymm10, ymm14, 61 + vpsllq ymm11, ymm14, 3 + vpor ymm8, ymm8, ymm9 + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + vpor ymm10, ymm10, ymm11 + mov rbx, r8 + lea r10, QWORD PTR [r10+r14] + xor rbx, r15 + vpxor ymm8, ymm8, ymm10 + and rdx, rbx + add r14, rax + xor rdx, r8 + vpsrlq ymm11, ymm14, 6 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + vpxor ymm8, ymm8, ymm11 + add r13, QWORD PTR [rsp+80] + mov rdx, r11 + xor rcx, rax + vpaddq ymm2, ymm8, ymm2 + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + vperm2I128 ymm14, ymm2, ymm2, 8 + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + vpsrlq ymm8, ymm14, 19 + vpsllq ymm9, ymm14, 45 + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + vpsrlq ymm10, ymm14, 61 + vpsllq ymm11, ymm14, 3 + vpor ymm8, ymm8, ymm9 + mov rdx, r15 + add r9, r13 + xor rdx, r14 + vpor ymm10, ymm10, ymm11 + and rbx, rdx + add r13, rax + xor rbx, r15 + vpxor ymm8, ymm8, ymm10 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + vpsrlq ymm11, ymm14, 6 + add r12, QWORD PTR [rsp+88] + mov rbx, r10 + xor rcx, rax + xor rbx, r11 + rorx rax, r9, 41 + xor rax, 
rcx + vpxor ymm8, ymm8, ymm11 + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + vpaddq ymm2, ymm8, ymm2 + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + vpaddq ymm8, ymm2, [rsi+64] + mov rbx, r14 + lea r8, QWORD PTR [r8+r12] + xor rbx, r13 + and rdx, rbx + add r12, rax + xor rdx, r14 + vmovdqu YMMWORD PTR [rsp+64], ymm8 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + vpblendd ymm12, ymm3, ymm0, 3 + vpblendd ymm13, ymm1, ymm2, 3 + add r11, QWORD PTR [rsp+96] + mov rdx, r9 + xor rcx, rax + vpermq ymm12, ymm12, 57 + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + vpermq ymm13, ymm13, 57 + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + vpsrlq ymm11, ymm12, 7 + mov rdx, r13 + add r15, r11 + xor rdx, r12 + vperm2I128 ymm14, ymm2, ymm2, 129 + and rbx, rdx + add r11, rax + xor rbx, r13 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + vpxor ymm8, ymm8, ymm10 + add r10, QWORD PTR [rsp+104] + mov rbx, r8 + xor rcx, rax + vpxor ymm8, ymm8, ymm11 + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + vpaddq ymm3, ymm13, ymm3 + vpaddq ymm3, ymm8, ymm3 + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + vpsrlq ymm8, ymm14, 19 + vpsllq ymm9, ymm14, 45 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + vpsrlq ymm10, ymm14, 61 + vpsllq ymm11, ymm14, 3 + vpor ymm8, ymm8, ymm9 + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + vpor ymm10, ymm10, ymm11 + mov rbx, r12 + lea r14, QWORD PTR [r14+r10] + xor rbx, r11 + vpxor ymm8, ymm8, ymm10 + and rdx, rbx + add r10, rax + xor rdx, r12 + vpsrlq ymm11, ymm14, 6 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + vpxor ymm8, ymm8, ymm11 + add r9, QWORD PTR [rsp+112] + mov rdx, r15 + xor rcx, rax + vpaddq ymm3, ymm8, ymm3 + 
xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + vperm2I128 ymm14, ymm3, ymm3, 8 + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + vpsrlq ymm8, ymm14, 19 + vpsllq ymm9, ymm14, 45 + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + vpsrlq ymm10, ymm14, 61 + vpsllq ymm11, ymm14, 3 + vpor ymm8, ymm8, ymm9 + mov rdx, r11 + add r13, r9 + xor rdx, r10 + vpor ymm10, ymm10, ymm11 + and rbx, rdx + add r9, rax + xor rbx, r11 + vpxor ymm8, ymm8, ymm10 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + vpsrlq ymm11, ymm14, 6 + add r8, QWORD PTR [rsp+120] + mov rbx, r14 + xor rcx, rax + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + vpxor ymm8, ymm8, ymm11 + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + vpaddq ymm3, ymm8, ymm3 + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + vpaddq ymm8, ymm3, [rsi+96] + mov rbx, r10 + lea r12, QWORD PTR [r12+r8] + xor rbx, r9 + and rdx, rbx + add r8, rax + xor rdx, r10 + vmovdqu YMMWORD PTR [rsp+96], ymm8 + sub DWORD PTR [rsp+128], 1 + jne L_sha256_len_avx2_rorx_start + ; rnd_all_4: 0-3 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + add r15, QWORD PTR [rsp] + mov rdx, r13 + xor rcx, rax + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + add r14, QWORD PTR [rsp+8] + mov rbx, r12 + xor rcx, rax + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + lea r10, QWORD PTR [r10+r14] + xor rbx, r15 + and rdx, rbx + add r14, rax + xor rdx, r8 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, 
rdx + add r13, QWORD PTR [rsp+16] + mov rdx, r11 + xor rcx, rax + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + add r12, QWORD PTR [rsp+24] + mov rbx, r10 + xor rcx, rax + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + lea r8, QWORD PTR [r8+r12] + xor rbx, r13 + and rdx, rbx + add r12, rax + xor rdx, r14 + ; rnd_all_4: 4-7 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + add r11, QWORD PTR [rsp+32] + mov rdx, r9 + xor rcx, rax + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + mov rdx, r13 + add r15, r11 + xor rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + add r10, QWORD PTR [rsp+40] + mov rbx, r8 + xor rcx, rax + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + lea r14, QWORD PTR [r14+r10] + xor rbx, r11 + and rdx, rbx + add r10, rax + xor rdx, r12 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + add r9, QWORD PTR [rsp+48] + mov rdx, r15 + xor rcx, rax + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, 
r11 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + add r8, QWORD PTR [rsp+56] + mov rbx, r14 + xor rcx, rax + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + lea r12, QWORD PTR [r12+r8] + xor rbx, r9 + and rdx, rbx + add r8, rax + xor rdx, r10 + ; rnd_all_4: 8-11 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + add r15, QWORD PTR [rsp+64] + mov rdx, r13 + xor rcx, rax + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + add r14, QWORD PTR [rsp+72] + mov rbx, r12 + xor rcx, rax + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + lea r10, QWORD PTR [r10+r14] + xor rbx, r15 + and rdx, rbx + add r14, rax + xor rdx, r8 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + add r13, QWORD PTR [rsp+80] + mov rdx, r11 + xor rcx, rax + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + add r12, QWORD PTR [rsp+88] + mov rbx, r10 + xor rcx, rax + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + lea r8, QWORD 
PTR [r8+r12] + xor rbx, r13 + and rdx, rbx + add r12, rax + xor rdx, r14 + ; rnd_all_4: 12-15 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + add r11, QWORD PTR [rsp+96] + mov rdx, r9 + xor rcx, rax + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + mov rdx, r13 + add r15, r11 + xor rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + add r10, QWORD PTR [rsp+104] + mov rbx, r8 + xor rcx, rax + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + lea r14, QWORD PTR [r14+r10] + xor rbx, r11 + and rdx, rbx + add r10, rax + xor rdx, r12 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + add r9, QWORD PTR [rsp+112] + mov rdx, r15 + xor rcx, rax + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, r11 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + add r8, QWORD PTR [rsp+120] + mov rbx, r14 + xor rcx, rax + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + lea r12, QWORD PTR [r12+r8] + xor rbx, r9 + and rdx, rbx + add r8, rax + xor rdx, r10 + add r8, rdx + add QWORD PTR [rdi], r8 + add QWORD PTR [rdi+8], r9 + add QWORD PTR [rdi+16], r10 + add QWORD PTR [rdi+24], r11 + add QWORD PTR [rdi+32], r12 + add QWORD PTR [rdi+40], r13 + add QWORD PTR [rdi+48], r14 + add QWORD PTR [rdi+56], r15 + xor rax, rax + vzeroupper 
+ vmovdqu xmm6, OWORD PTR [rsp+136] + vmovdqu xmm7, OWORD PTR [rsp+152] + vmovdqu xmm8, OWORD PTR [rsp+168] + vmovdqu xmm9, OWORD PTR [rsp+184] + vmovdqu xmm10, OWORD PTR [rsp+200] + vmovdqu xmm11, OWORD PTR [rsp+216] + vmovdqu xmm14, OWORD PTR [rsp+232] + vmovdqu xmm13, OWORD PTR [rsp+248] + vmovdqu xmm12, OWORD PTR [rsp+264] + vmovdqu xmm15, OWORD PTR [rsp+280] + add rsp, 296 + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha512_AVX2_RORX ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +Transform_Sha512_AVX2_RORX_Len PROC + push rbx + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbp + mov rdi, rcx + mov rsi, rdx + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm14 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm12 + vmovdqu OWORD PTR [rsp+144], xmm15 + test sil, 128 + je L_sha512_len_avx2_rorx_block + mov rax, QWORD PTR [rdi+224] + push rsi + vmovdqu ymm0, YMMWORD PTR [rax] + vmovdqu ymm1, YMMWORD PTR [rax+32] + vmovdqu ymm2, YMMWORD PTR [rax+64] + vmovdqu ymm3, YMMWORD PTR [rax+96] + vmovups YMMWORD PTR [rdi+64], ymm0 + vmovups YMMWORD PTR [rdi+96], ymm1 + vmovups YMMWORD PTR [rdi+128], ymm2 + vmovups YMMWORD PTR [rdi+160], ymm3 + call Transform_Sha512_AVX2_RORX + pop rsi + add QWORD PTR [rdi+224], 128 + sub esi, 128 + jz L_sha512_len_avx2_rorx_done +L_sha512_len_avx2_rorx_block: + sub rsp, 1352 + mov rax, QWORD PTR [rdi+224] + vmovdqu ymm15, YMMWORD PTR L_avx2_rorx_sha512_flip_mask + mov r8, QWORD PTR [rdi] + mov r9, QWORD PTR [rdi+8] + mov r10, QWORD PTR [rdi+16] + mov r11, QWORD PTR [rdi+24] + mov r12, QWORD PTR [rdi+32] + mov r13, QWORD PTR [rdi+40] + mov r14, QWORD PTR [rdi+48] + mov r15, QWORD PTR [rdi+56] + mov DWORD PTR [rsp+1344], esi + ; Start of loop processing 
two blocks +L_sha512_len_avx2_rorx_begin: + mov rsi, rsp + mov rbp, QWORD PTR [ptr_L_avx2_rorx_sha512_k_2] + mov rbx, r9 + xor rdx, rdx + vmovdqu xmm0, OWORD PTR [rax] + vmovdqu xmm1, OWORD PTR [rax+16] + vinserti128 ymm0, ymm0, OWORD PTR [rax+128], 1 + vinserti128 ymm1, ymm1, OWORD PTR [rax+144], 1 + vpshufb ymm0, ymm0, ymm15 + vpshufb ymm1, ymm1, ymm15 + vmovdqu xmm2, OWORD PTR [rax+32] + vmovdqu xmm3, OWORD PTR [rax+48] + vinserti128 ymm2, ymm2, OWORD PTR [rax+160], 1 + vinserti128 ymm3, ymm3, OWORD PTR [rax+176], 1 + vpshufb ymm2, ymm2, ymm15 + vpshufb ymm3, ymm3, ymm15 + vmovdqu xmm4, OWORD PTR [rax+64] + vmovdqu xmm5, OWORD PTR [rax+80] + vinserti128 ymm4, ymm4, OWORD PTR [rax+192], 1 + vinserti128 ymm5, ymm5, OWORD PTR [rax+208], 1 + vpshufb ymm4, ymm4, ymm15 + vpshufb ymm5, ymm5, ymm15 + vmovdqu xmm6, OWORD PTR [rax+96] + vmovdqu xmm7, OWORD PTR [rax+112] + vinserti128 ymm6, ymm6, OWORD PTR [rax+224], 1 + vinserti128 ymm7, ymm7, OWORD PTR [rax+240], 1 + vpshufb ymm6, ymm6, ymm15 + vpshufb ymm7, ymm7, ymm15 + xor rbx, r10 + ; Start of 16 rounds +L_sha512_len_avx2_rorx_start: + vpaddq ymm8, ymm0, [rbp] + vpaddq ymm9, ymm1, [rbp+32] + vmovdqu YMMWORD PTR [rsi], ymm8 + vmovdqu YMMWORD PTR [rsi+32], ymm9 + vpaddq ymm8, ymm2, [rbp+64] + vpaddq ymm9, ymm3, [rbp+96] + vmovdqu YMMWORD PTR [rsi+64], ymm8 + vmovdqu YMMWORD PTR [rsi+96], ymm9 + vpaddq ymm8, ymm4, [rbp+128] + vpaddq ymm9, ymm5, [rbp+160] + vmovdqu YMMWORD PTR [rsi+128], ymm8 + vmovdqu YMMWORD PTR [rsi+160], ymm9 + vpaddq ymm8, ymm6, [rbp+192] + vpaddq ymm9, ymm7, [rbp+224] + vmovdqu YMMWORD PTR [rsi+192], ymm8 + vmovdqu YMMWORD PTR [rsi+224], ymm9 + ; msg_sched: 0-1 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + vpalignr ymm12, ymm1, ymm0, 8 + add r15, QWORD PTR [rsi] + mov rdx, r13 + xor rcx, rax + vpalignr ymm13, ymm5, ymm4, 8 + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + vpsrlq ymm10, 
ymm12, 8 + vpsllq ymm11, ymm12, 56 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + mov rdx, r9 + add r11, r15 + xor rdx, r8 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm0, ymm13, ymm0 + and rbx, rdx + add r15, rax + xor rbx, r9 + vpaddq ymm0, ymm8, ymm0 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + vpsrlq ymm8, ymm7, 19 + vpsllq ymm9, ymm7, 45 + add r14, QWORD PTR [rsi+8] + mov rbx, r12 + xor rcx, rax + vpsrlq ymm10, ymm7, 61 + vpsllq ymm11, ymm7, 3 + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm7, 6 + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + lea r10, QWORD PTR [r10+r14] + xor rbx, r15 + vpxor ymm8, ymm8, ymm11 + and rdx, rbx + add r14, rax + xor rdx, r8 + vpaddq ymm0, ymm8, ymm0 + ; msg_sched done: 0-1 + ; msg_sched: 4-5 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + vpalignr ymm12, ymm2, ymm1, 8 + add r13, QWORD PTR [rsi+32] + mov rdx, r11 + xor rcx, rax + vpalignr ymm13, ymm6, ymm5, 8 + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + mov rdx, r15 + add r9, r13 + xor rdx, r14 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm1, ymm13, ymm1 + and rbx, rdx + add r13, rax + xor rbx, r15 + vpaddq ymm1, ymm8, ymm1 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + vpsrlq ymm8, ymm0, 19 + vpsllq ymm9, ymm0, 45 + add r12, QWORD PTR [rsi+40] + mov 
rbx, r10 + xor rcx, rax + vpsrlq ymm10, ymm0, 61 + vpsllq ymm11, ymm0, 3 + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm0, 6 + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + lea r8, QWORD PTR [r8+r12] + xor rbx, r13 + vpxor ymm8, ymm8, ymm11 + and rdx, rbx + add r12, rax + xor rdx, r14 + vpaddq ymm1, ymm8, ymm1 + ; msg_sched done: 4-5 + ; msg_sched: 8-9 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + vpalignr ymm12, ymm3, ymm2, 8 + add r11, QWORD PTR [rsi+64] + mov rdx, r9 + xor rcx, rax + vpalignr ymm13, ymm7, ymm6, 8 + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + mov rdx, r13 + add r15, r11 + xor rdx, r12 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm2, ymm13, ymm2 + and rbx, rdx + add r11, rax + xor rbx, r13 + vpaddq ymm2, ymm8, ymm2 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + vpsrlq ymm8, ymm1, 19 + vpsllq ymm9, ymm1, 45 + add r10, QWORD PTR [rsi+72] + mov rbx, r8 + xor rcx, rax + vpsrlq ymm10, ymm1, 61 + vpsllq ymm11, ymm1, 3 + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm1, 6 + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + lea r14, QWORD PTR [r14+r10] + xor rbx, r11 + vpxor ymm8, ymm8, ymm11 + and rdx, rbx + add r10, rax + xor rdx, r12 + vpaddq ymm2, ymm8, ymm2 + ; msg_sched 
done: 8-9 + ; msg_sched: 12-13 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + vpalignr ymm12, ymm4, ymm3, 8 + add r9, QWORD PTR [rsi+96] + mov rdx, r15 + xor rcx, rax + vpalignr ymm13, ymm0, ymm7, 8 + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + mov rdx, r11 + add r13, r9 + xor rdx, r10 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm3, ymm13, ymm3 + and rbx, rdx + add r9, rax + xor rbx, r11 + vpaddq ymm3, ymm8, ymm3 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + vpsrlq ymm8, ymm2, 19 + vpsllq ymm9, ymm2, 45 + add r8, QWORD PTR [rsi+104] + mov rbx, r14 + xor rcx, rax + vpsrlq ymm10, ymm2, 61 + vpsllq ymm11, ymm2, 3 + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm2, 6 + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + lea r12, QWORD PTR [r12+r8] + xor rbx, r9 + vpxor ymm8, ymm8, ymm11 + and rdx, rbx + add r8, rax + xor rdx, r10 + vpaddq ymm3, ymm8, ymm3 + ; msg_sched done: 12-13 + ; msg_sched: 16-17 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + vpalignr ymm12, ymm5, ymm4, 8 + add r15, QWORD PTR [rsi+128] + mov rdx, r13 + xor rcx, rax + vpalignr ymm13, ymm1, ymm0, 8 + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + rorx rax, r8, 39 + add r15, rdx + 
xor rax, rcx + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + mov rdx, r9 + add r11, r15 + xor rdx, r8 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm4, ymm13, ymm4 + and rbx, rdx + add r15, rax + xor rbx, r9 + vpaddq ymm4, ymm8, ymm4 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + vpsrlq ymm8, ymm3, 19 + vpsllq ymm9, ymm3, 45 + add r14, QWORD PTR [rsi+136] + mov rbx, r12 + xor rcx, rax + vpsrlq ymm10, ymm3, 61 + vpsllq ymm11, ymm3, 3 + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm3, 6 + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + lea r10, QWORD PTR [r10+r14] + xor rbx, r15 + vpxor ymm8, ymm8, ymm11 + and rdx, rbx + add r14, rax + xor rdx, r8 + vpaddq ymm4, ymm8, ymm4 + ; msg_sched done: 16-17 + ; msg_sched: 20-21 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + vpalignr ymm12, ymm6, ymm5, 8 + add r13, QWORD PTR [rsi+160] + mov rdx, r11 + xor rcx, rax + vpalignr ymm13, ymm2, ymm1, 8 + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + mov rdx, r15 + add r9, r13 + xor rdx, r14 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm5, ymm13, ymm5 + and rbx, rdx + add r13, rax + xor rbx, r15 + vpaddq ymm5, ymm8, ymm5 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + vpsrlq ymm8, ymm4, 19 + vpsllq ymm9, ymm4, 45 + add r12, QWORD PTR [rsi+168] + mov rbx, r10 + xor rcx, rax + vpsrlq ymm10, ymm4, 61 + vpsllq ymm11, ymm4, 3 + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + vpor ymm8, ymm8, ymm9 + vpor ymm10, 
ymm10, ymm11 + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm4, 6 + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + lea r8, QWORD PTR [r8+r12] + xor rbx, r13 + vpxor ymm8, ymm8, ymm11 + and rdx, rbx + add r12, rax + xor rdx, r14 + vpaddq ymm5, ymm8, ymm5 + ; msg_sched done: 20-21 + ; msg_sched: 24-25 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + vpalignr ymm12, ymm7, ymm6, 8 + add r11, QWORD PTR [rsi+192] + mov rdx, r9 + xor rcx, rax + vpalignr ymm13, ymm3, ymm2, 8 + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + mov rdx, r13 + add r15, r11 + xor rdx, r12 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm6, ymm13, ymm6 + and rbx, rdx + add r11, rax + xor rbx, r13 + vpaddq ymm6, ymm8, ymm6 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + vpsrlq ymm8, ymm5, 19 + vpsllq ymm9, ymm5, 45 + add r10, QWORD PTR [rsi+200] + mov rbx, r8 + xor rcx, rax + vpsrlq ymm10, ymm5, 61 + vpsllq ymm11, ymm5, 3 + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm5, 6 + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + lea r14, QWORD PTR [r14+r10] + xor rbx, r11 + vpxor ymm8, ymm8, ymm11 + and rdx, rbx + add r10, rax + xor rdx, r12 + vpaddq ymm6, ymm8, ymm6 + ; msg_sched done: 24-25 + ; msg_sched: 28-29 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + vpalignr ymm12, ymm0, ymm7, 8 + add r9, QWORD PTR [rsi+224] + mov 
rdx, r15 + xor rcx, rax + vpalignr ymm13, ymm4, ymm3, 8 + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + vpsrlq ymm8, ymm12, 1 + vpsllq ymm9, ymm12, 63 + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + vpsrlq ymm10, ymm12, 8 + vpsllq ymm11, ymm12, 56 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + vpsrlq ymm11, ymm12, 7 + vpxor ymm8, ymm8, ymm10 + mov rdx, r11 + add r13, r9 + xor rdx, r10 + vpxor ymm8, ymm8, ymm11 + vpaddq ymm7, ymm13, ymm7 + and rbx, rdx + add r9, rax + xor rbx, r11 + vpaddq ymm7, ymm8, ymm7 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + vpsrlq ymm8, ymm6, 19 + vpsllq ymm9, ymm6, 45 + add r8, QWORD PTR [rsi+232] + mov rbx, r14 + xor rcx, rax + vpsrlq ymm10, ymm6, 61 + vpsllq ymm11, ymm6, 3 + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + vpor ymm8, ymm8, ymm9 + vpor ymm10, ymm10, ymm11 + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + vpxor ymm8, ymm8, ymm10 + vpsrlq ymm11, ymm6, 6 + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + lea r12, QWORD PTR [r12+r8] + xor rbx, r9 + vpxor ymm8, ymm8, ymm11 + and rdx, rbx + add r8, rax + xor rdx, r10 + vpaddq ymm7, ymm8, ymm7 + ; msg_sched done: 28-29 + add rbp, 256 + add rsi, 256 + cmp rbp, QWORD PTR [L_avx2_rorx_sha512_k_2_end] + jne L_sha512_len_avx2_rorx_start + vpaddq ymm8, ymm0, [rbp] + vpaddq ymm9, ymm1, [rbp+32] + vmovdqu YMMWORD PTR [rsi], ymm8 + vmovdqu YMMWORD PTR [rsi+32], ymm9 + vpaddq ymm8, ymm2, [rbp+64] + vpaddq ymm9, ymm3, [rbp+96] + vmovdqu YMMWORD PTR [rsi+64], ymm8 + vmovdqu YMMWORD PTR [rsi+96], ymm9 + vpaddq ymm8, ymm4, [rbp+128] + vpaddq ymm9, ymm5, [rbp+160] + vmovdqu YMMWORD PTR [rsi+128], ymm8 + vmovdqu YMMWORD PTR [rsi+160], ymm9 + vpaddq ymm8, ymm6, [rbp+192] + vpaddq ymm9, ymm7, [rbp+224] + vmovdqu YMMWORD PTR [rsi+192], ymm8 + vmovdqu YMMWORD PTR [rsi+224], ymm9 + ; rnd_all_2: 0-1 + rorx rax, r12, 
14 + rorx rcx, r12, 18 + add r8, rdx + add r15, QWORD PTR [rsi] + mov rdx, r13 + xor rcx, rax + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + add r14, QWORD PTR [rsi+8] + mov rbx, r12 + xor rcx, rax + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + lea r10, QWORD PTR [r10+r14] + xor rbx, r15 + and rdx, rbx + add r14, rax + xor rdx, r8 + ; rnd_all_2: 4-5 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + add r13, QWORD PTR [rsi+32] + mov rdx, r11 + xor rcx, rax + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + add r12, QWORD PTR [rsi+40] + mov rbx, r10 + xor rcx, rax + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + lea r8, QWORD PTR [r8+r12] + xor rbx, r13 + and rdx, rbx + add r12, rax + xor rdx, r14 + ; rnd_all_2: 8-9 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + add r11, QWORD PTR [rsi+64] + mov rdx, r9 + xor rcx, rax + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + mov rdx, r13 + add r15, r11 + xor 
rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + add r10, QWORD PTR [rsi+72] + mov rbx, r8 + xor rcx, rax + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + lea r14, QWORD PTR [r14+r10] + xor rbx, r11 + and rdx, rbx + add r10, rax + xor rdx, r12 + ; rnd_all_2: 12-13 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + add r9, QWORD PTR [rsi+96] + mov rdx, r15 + xor rcx, rax + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, r11 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + add r8, QWORD PTR [rsi+104] + mov rbx, r14 + xor rcx, rax + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + lea r12, QWORD PTR [r12+r8] + xor rbx, r9 + and rdx, rbx + add r8, rax + xor rdx, r10 + ; rnd_all_2: 16-17 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + add r15, QWORD PTR [rsi+128] + mov rdx, r13 + xor rcx, rax + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + add r14, QWORD PTR [rsi+136] + mov rbx, r12 + xor rcx, rax + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + rorx 
rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + lea r10, QWORD PTR [r10+r14] + xor rbx, r15 + and rdx, rbx + add r14, rax + xor rdx, r8 + ; rnd_all_2: 20-21 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + add r13, QWORD PTR [rsi+160] + mov rdx, r11 + xor rcx, rax + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + add r12, QWORD PTR [rsi+168] + mov rbx, r10 + xor rcx, rax + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + lea r8, QWORD PTR [r8+r12] + xor rbx, r13 + and rdx, rbx + add r12, rax + xor rdx, r14 + ; rnd_all_2: 24-25 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + add r11, QWORD PTR [rsi+192] + mov rdx, r9 + xor rcx, rax + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + mov rdx, r13 + add r15, r11 + xor rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + add r10, QWORD PTR [rsi+200] + mov rbx, r8 + xor rcx, rax + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + lea r14, QWORD PTR [r14+r10] + xor rbx, r11 + and rdx, rbx + add r10, rax + xor rdx, r12 + ; rnd_all_2: 28-29 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + add r9, QWORD PTR [rsi+224] + mov rdx, r15 + xor rcx, rax + xor rdx, r8 + rorx rax, r14, 41 
+ xor rax, rcx + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, r11 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + add r8, QWORD PTR [rsi+232] + mov rbx, r14 + xor rcx, rax + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + lea r12, QWORD PTR [r12+r8] + xor rbx, r9 + and rdx, rbx + add r8, rax + xor rdx, r10 + add r8, rdx + sub rsi, 1024 + add r8, QWORD PTR [rdi] + add r9, QWORD PTR [rdi+8] + add r10, QWORD PTR [rdi+16] + add r11, QWORD PTR [rdi+24] + add r12, QWORD PTR [rdi+32] + add r13, QWORD PTR [rdi+40] + add r14, QWORD PTR [rdi+48] + add r15, QWORD PTR [rdi+56] + mov QWORD PTR [rdi], r8 + mov QWORD PTR [rdi+8], r9 + mov QWORD PTR [rdi+16], r10 + mov QWORD PTR [rdi+24], r11 + mov QWORD PTR [rdi+32], r12 + mov QWORD PTR [rdi+40], r13 + mov QWORD PTR [rdi+48], r14 + mov QWORD PTR [rdi+56], r15 + mov rbx, r9 + xor rdx, rdx + xor rbx, r10 + mov rbp, 5 +L_sha512_len_avx2_rorx_tail: + ; rnd_all_2: 2-3 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + add r15, QWORD PTR [rsi+16] + mov rdx, r13 + xor rcx, rax + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + add r14, QWORD PTR [rsi+24] + mov rbx, r12 + xor rcx, rax + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + lea 
r10, QWORD PTR [r10+r14] + xor rbx, r15 + and rdx, rbx + add r14, rax + xor rdx, r8 + ; rnd_all_2: 6-7 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + add r13, QWORD PTR [rsi+48] + mov rdx, r11 + xor rcx, rax + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + add r12, QWORD PTR [rsi+56] + mov rbx, r10 + xor rcx, rax + xor rbx, r11 + rorx rax, r9, 41 + xor rax, rcx + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + lea r8, QWORD PTR [r8+r12] + xor rbx, r13 + and rdx, rbx + add r12, rax + xor rdx, r14 + ; rnd_all_2: 10-11 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + add r11, QWORD PTR [rsi+80] + mov rdx, r9 + xor rcx, rax + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + mov rdx, r13 + add r15, r11 + xor rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + add r10, QWORD PTR [rsi+88] + mov rbx, r8 + xor rcx, rax + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + lea r14, QWORD PTR [r14+r10] + xor rbx, r11 + and rdx, rbx + add r10, rax + xor rdx, r12 + ; rnd_all_2: 14-15 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + add r9, QWORD PTR [rsi+112] + mov rdx, r15 + xor rcx, rax + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + 
rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, r11 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + add r8, QWORD PTR [rsi+120] + mov rbx, r14 + xor rcx, rax + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + lea r12, QWORD PTR [r12+r8] + xor rbx, r9 + and rdx, rbx + add r8, rax + xor rdx, r10 + ; rnd_all_2: 18-19 + rorx rax, r12, 14 + rorx rcx, r12, 18 + add r8, rdx + add r15, QWORD PTR [rsi+144] + mov rdx, r13 + xor rcx, rax + xor rdx, r14 + rorx rax, r12, 41 + xor rax, rcx + and rdx, r12 + add r15, rax + rorx rax, r8, 28 + rorx rcx, r8, 34 + xor rdx, r14 + xor rcx, rax + rorx rax, r8, 39 + add r15, rdx + xor rax, rcx + mov rdx, r9 + add r11, r15 + xor rdx, r8 + and rbx, rdx + add r15, rax + xor rbx, r9 + rorx rax, r11, 14 + rorx rcx, r11, 18 + add r15, rbx + add r14, QWORD PTR [rsi+152] + mov rbx, r12 + xor rcx, rax + xor rbx, r13 + rorx rax, r11, 41 + xor rax, rcx + and rbx, r11 + add r14, rax + rorx rax, r15, 28 + rorx rcx, r15, 34 + xor rbx, r13 + xor rcx, rax + rorx rax, r15, 39 + add r14, rbx + xor rax, rcx + mov rbx, r8 + lea r10, QWORD PTR [r10+r14] + xor rbx, r15 + and rdx, rbx + add r14, rax + xor rdx, r8 + ; rnd_all_2: 22-23 + rorx rax, r10, 14 + rorx rcx, r10, 18 + add r14, rdx + add r13, QWORD PTR [rsi+176] + mov rdx, r11 + xor rcx, rax + xor rdx, r12 + rorx rax, r10, 41 + xor rax, rcx + and rdx, r10 + add r13, rax + rorx rax, r14, 28 + rorx rcx, r14, 34 + xor rdx, r12 + xor rcx, rax + rorx rax, r14, 39 + add r13, rdx + xor rax, rcx + mov rdx, r15 + add r9, r13 + xor rdx, r14 + and rbx, rdx + add r13, rax + xor rbx, r15 + rorx rax, r9, 14 + rorx rcx, r9, 18 + add r13, rbx + add r12, QWORD PTR [rsi+184] + mov rbx, r10 + xor rcx, rax + xor rbx, r11 + rorx 
rax, r9, 41 + xor rax, rcx + and rbx, r9 + add r12, rax + rorx rax, r13, 28 + rorx rcx, r13, 34 + xor rbx, r11 + xor rcx, rax + rorx rax, r13, 39 + add r12, rbx + xor rax, rcx + mov rbx, r14 + lea r8, QWORD PTR [r8+r12] + xor rbx, r13 + and rdx, rbx + add r12, rax + xor rdx, r14 + ; rnd_all_2: 26-27 + rorx rax, r8, 14 + rorx rcx, r8, 18 + add r12, rdx + add r11, QWORD PTR [rsi+208] + mov rdx, r9 + xor rcx, rax + xor rdx, r10 + rorx rax, r8, 41 + xor rax, rcx + and rdx, r8 + add r11, rax + rorx rax, r12, 28 + rorx rcx, r12, 34 + xor rdx, r10 + xor rcx, rax + rorx rax, r12, 39 + add r11, rdx + xor rax, rcx + mov rdx, r13 + add r15, r11 + xor rdx, r12 + and rbx, rdx + add r11, rax + xor rbx, r13 + rorx rax, r15, 14 + rorx rcx, r15, 18 + add r11, rbx + add r10, QWORD PTR [rsi+216] + mov rbx, r8 + xor rcx, rax + xor rbx, r9 + rorx rax, r15, 41 + xor rax, rcx + and rbx, r15 + add r10, rax + rorx rax, r11, 28 + rorx rcx, r11, 34 + xor rbx, r9 + xor rcx, rax + rorx rax, r11, 39 + add r10, rbx + xor rax, rcx + mov rbx, r12 + lea r14, QWORD PTR [r14+r10] + xor rbx, r11 + and rdx, rbx + add r10, rax + xor rdx, r12 + ; rnd_all_2: 30-31 + rorx rax, r14, 14 + rorx rcx, r14, 18 + add r10, rdx + add r9, QWORD PTR [rsi+240] + mov rdx, r15 + xor rcx, rax + xor rdx, r8 + rorx rax, r14, 41 + xor rax, rcx + and rdx, r14 + add r9, rax + rorx rax, r10, 28 + rorx rcx, r10, 34 + xor rdx, r8 + xor rcx, rax + rorx rax, r10, 39 + add r9, rdx + xor rax, rcx + mov rdx, r11 + add r13, r9 + xor rdx, r10 + and rbx, rdx + add r9, rax + xor rbx, r11 + rorx rax, r13, 14 + rorx rcx, r13, 18 + add r9, rbx + add r8, QWORD PTR [rsi+248] + mov rbx, r14 + xor rcx, rax + xor rbx, r15 + rorx rax, r13, 41 + xor rax, rcx + and rbx, r13 + add r8, rax + rorx rax, r9, 28 + rorx rcx, r9, 34 + xor rbx, r15 + xor rcx, rax + rorx rax, r9, 39 + add r8, rbx + xor rax, rcx + mov rbx, r10 + lea r12, QWORD PTR [r12+r8] + xor rbx, r9 + and rdx, rbx + add r8, rax + xor rdx, r10 + add rsi, 256 + sub rbp, 1 + jnz 
L_sha512_len_avx2_rorx_tail + add r8, rdx + add r8, QWORD PTR [rdi] + add r9, QWORD PTR [rdi+8] + add r10, QWORD PTR [rdi+16] + add r11, QWORD PTR [rdi+24] + add r12, QWORD PTR [rdi+32] + add r13, QWORD PTR [rdi+40] + add r14, QWORD PTR [rdi+48] + add r15, QWORD PTR [rdi+56] + mov rax, QWORD PTR [rdi+224] + add rax, 256 + sub DWORD PTR [rsp+1344], 256 + mov QWORD PTR [rdi+224], rax + mov QWORD PTR [rdi], r8 + mov QWORD PTR [rdi+8], r9 + mov QWORD PTR [rdi+16], r10 + mov QWORD PTR [rdi+24], r11 + mov QWORD PTR [rdi+32], r12 + mov QWORD PTR [rdi+40], r13 + mov QWORD PTR [rdi+48], r14 + mov QWORD PTR [rdi+56], r15 + jnz L_sha512_len_avx2_rorx_begin + add rsp, 1352 +L_sha512_len_avx2_rorx_done: + xor rax, rax + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm14, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm12, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + pop rbp + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +Transform_Sha512_AVX2_RORX_Len ENDP +_TEXT ENDS +ENDIF +END diff --git a/wolfcrypt/src/wc_mldsa_asm.S b/wolfcrypt/src/wc_mldsa_asm.S index e1e77a93783..db09680752f 100644 --- a/wolfcrypt/src/wc_mldsa_asm.S +++ b/wolfcrypt/src/wc_mldsa_asm.S @@ -22755,7 +22755,7 @@ _wc_mldsa_decode_t0_avx2: vpsubd %ymm4, %ymm15, %ymm4 vmovdqu %ymm4, 32(%rsi) # 3/32 - vperm2i128 $0x21, %ymm1, %ymm0, %ymm0 + vperm2i128 $33, %ymm1, %ymm0, %ymm0 vpermq $0xe9, %ymm0, %ymm4 vpshufb %ymm7, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 @@ -22770,7 +22770,7 @@ _wc_mldsa_decode_t0_avx2: vpsubd %ymm4, %ymm15, %ymm4 vmovdqu %ymm4, 96(%rsi) # 5/32 - vperm2i128 $0x21, %ymm2, %ymm1, %ymm1 + vperm2i128 $33, %ymm2, %ymm1, %ymm1 vpermq $0x94, %ymm1, %ymm4 vpshufb %ymm9, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 @@ -22792,7 +22792,7 @@ 
_wc_mldsa_decode_t0_avx2: vpsubd %ymm4, %ymm15, %ymm4 vmovdqu %ymm4, 192(%rsi) # 8/32 - vperm2i128 $0x21, %ymm3, %ymm2, %ymm2 + vperm2i128 $33, %ymm3, %ymm2, %ymm2 vpermq $0xe9, %ymm2, %ymm4 vpshufb %ymm12, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 @@ -22809,7 +22809,7 @@ _wc_mldsa_decode_t0_avx2: # 10/32 vmovdqu 128(%rdi), %ymm0 vmovdqu 160(%rdi), %ymm1 - vperm2i128 $0x21, %ymm0, %ymm3, %ymm3 + vperm2i128 $33, %ymm0, %ymm3, %ymm3 vpermq $0x94, %ymm3, %ymm4 vpshufb %ymm6, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 @@ -22831,7 +22831,7 @@ _wc_mldsa_decode_t0_avx2: vpsubd %ymm4, %ymm15, %ymm4 vmovdqu %ymm4, 352(%rsi) # 13/32 - vperm2i128 $0x21, %ymm1, %ymm0, %ymm0 + vperm2i128 $33, %ymm1, %ymm0, %ymm0 vpermq $0xe9, %ymm0, %ymm4 vpshufb %ymm9, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 @@ -22848,7 +22848,7 @@ _wc_mldsa_decode_t0_avx2: # 15/32 vmovdqu 192(%rdi), %ymm2 vmovdqu 224(%rdi), %ymm3 - vperm2i128 $0x21, %ymm2, %ymm1, %ymm1 + vperm2i128 $33, %ymm2, %ymm1, %ymm1 vpermq $0x94, %ymm1, %ymm4 vpshufb %ymm11, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 @@ -22870,7 +22870,7 @@ _wc_mldsa_decode_t0_avx2: vpsubd %ymm4, %ymm15, %ymm4 vmovdqu %ymm4, 512(%rsi) # 18/32 - vperm2i128 $0x21, %ymm3, %ymm2, %ymm2 + vperm2i128 $33, %ymm3, %ymm2, %ymm2 vpermq $0xe9, %ymm2, %ymm4 vpshufb %ymm6, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 @@ -22887,7 +22887,7 @@ _wc_mldsa_decode_t0_avx2: # 20/32 vmovdqu 256(%rdi), %ymm0 vmovdqu 288(%rdi), %ymm1 - vperm2i128 $0x21, %ymm0, %ymm3, %ymm3 + vperm2i128 $33, %ymm0, %ymm3, %ymm3 vpermq $0x94, %ymm3, %ymm4 vpshufb %ymm8, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 @@ -22909,7 +22909,7 @@ _wc_mldsa_decode_t0_avx2: vpsubd %ymm4, %ymm15, %ymm4 vmovdqu %ymm4, 672(%rsi) # 23/32 - vperm2i128 $0x21, %ymm1, %ymm0, %ymm0 + vperm2i128 $33, %ymm1, %ymm0, %ymm0 vpermq $0xe9, %ymm0, %ymm4 vpshufb %ymm11, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 @@ -22926,7 +22926,7 @@ _wc_mldsa_decode_t0_avx2: # 25/32 vmovdqu 320(%rdi), %ymm2 vmovdqu 352(%rdi), %ymm3 - 
vperm2i128 $0x21, %ymm2, %ymm1, %ymm1 + vperm2i128 $33, %ymm2, %ymm1, %ymm1 vpermq $0x99, %ymm1, %ymm4 vpshufb %ymm5, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 @@ -22941,14 +22941,14 @@ _wc_mldsa_decode_t0_avx2: vpsubd %ymm4, %ymm15, %ymm4 vmovdqu %ymm4, 800(%rsi) # 27/32 - vpermq $0x3e, %ymm2, %ymm4 + vpermq $62, %ymm2, %ymm4 vpshufb %ymm7, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 vpand %ymm14, %ymm4, %ymm4 vpsubd %ymm4, %ymm15, %ymm4 vmovdqu %ymm4, 832(%rsi) # 28/32 - vperm2i128 $0x21, %ymm3, %ymm2, %ymm2 + vperm2i128 $33, %ymm3, %ymm2, %ymm2 vpermq $0xe9, %ymm2, %ymm4 vpshufb %ymm8, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 @@ -22965,7 +22965,7 @@ _wc_mldsa_decode_t0_avx2: # 30/32 vmovdqu 384(%rdi), %ymm0 vmovdqu 416(%rdi), %ymm1 - vperm2i128 $0x21, %ymm0, %ymm3, %ymm3 + vperm2i128 $33, %ymm0, %ymm3, %ymm3 vpermq $0x99, %ymm3, %ymm4 vpshufb %ymm10, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 @@ -22980,7 +22980,7 @@ _wc_mldsa_decode_t0_avx2: vpsubd %ymm4, %ymm15, %ymm4 vmovdqu %ymm4, 960(%rsi) # 32/32 - vpermq $0x3e, %ymm0, %ymm4 + vpermq $62, %ymm0, %ymm4 vpshufb %ymm12, %ymm4, %ymm4 vpsrlvd %ymm13, %ymm4, %ymm4 vpand %ymm14, %ymm4, %ymm4 @@ -23115,14 +23115,14 @@ _wc_mldsa_decode_t1_avx2: vpslld $13, %ymm4, %ymm4 vmovdqu %ymm4, 32(%rsi) # 3/32 - vpermq $0x3e, %ymm0, %ymm4 + vpermq $62, %ymm0, %ymm4 vpshufb %ymm7, %ymm4, %ymm4 vpsrlvd %ymm9, %ymm4, %ymm4 vpand %ymm10, %ymm4, %ymm4 vpslld $13, %ymm4, %ymm4 vmovdqu %ymm4, 64(%rsi) # 4/32 - vperm2i128 $0x21, %ymm1, %ymm0, %ymm0 + vperm2i128 $33, %ymm1, %ymm0, %ymm0 vpermq $0xe9, %ymm0, %ymm4 vpshufb %ymm8, %ymm4, %ymm4 vpsrlvd %ymm9, %ymm4, %ymm4 @@ -23144,7 +23144,7 @@ _wc_mldsa_decode_t1_avx2: vpslld $13, %ymm4, %ymm4 vmovdqu %ymm4, 160(%rsi) # 7/32 - vperm2i128 $0x21, %ymm2, %ymm1, %ymm1 + vperm2i128 $33, %ymm2, %ymm1, %ymm1 vpermq $0xe9, %ymm1, %ymm4 vpshufb %ymm7, %ymm4, %ymm4 vpsrlvd %ymm9, %ymm4, %ymm4 @@ -23166,7 +23166,7 @@ _wc_mldsa_decode_t1_avx2: vpslld $13, %ymm4, %ymm4 vmovdqu %ymm4, 256(%rsi) # 10/32 
- vperm2i128 $0x21, %ymm3, %ymm2, %ymm2 + vperm2i128 $33, %ymm3, %ymm2, %ymm2 vpermq $0x99, %ymm2, %ymm4 vpshufb %ymm6, %ymm4, %ymm4 vpsrlvd %ymm9, %ymm4, %ymm4 @@ -23190,7 +23190,7 @@ _wc_mldsa_decode_t1_avx2: # 13/32 vmovdqu 128(%rdi), %ymm0 vmovdqu 160(%rdi), %ymm1 - vperm2i128 $0x21, %ymm0, %ymm3, %ymm3 + vperm2i128 $33, %ymm0, %ymm3, %ymm3 vpermq $0x99, %ymm3, %ymm4 vpshufb %ymm5, %ymm4, %ymm4 vpsrlvd %ymm9, %ymm4, %ymm4 @@ -23212,7 +23212,7 @@ _wc_mldsa_decode_t1_avx2: vpslld $13, %ymm4, %ymm4 vmovdqu %ymm4, 448(%rsi) # 16/32 - vpermq $0x3e, %ymm0, %ymm4 + vpermq $62, %ymm0, %ymm4 vpshufb %ymm8, %ymm4, %ymm4 vpsrlvd %ymm9, %ymm4, %ymm4 vpand %ymm10, %ymm4, %ymm4 @@ -23233,7 +23233,7 @@ _wc_mldsa_decode_t1_avx2: vpslld $13, %ymm4, %ymm4 vmovdqu %ymm4, 544(%rsi) # 19/32 - vpermq $0x3e, %ymm1, %ymm4 + vpermq $62, %ymm1, %ymm4 vpshufb %ymm7, %ymm4, %ymm4 vpsrlvd %ymm9, %ymm4, %ymm4 vpand %ymm10, %ymm4, %ymm4 @@ -23242,7 +23242,7 @@ _wc_mldsa_decode_t1_avx2: # 20/32 vmovdqu 192(%rdi), %ymm2 vmovdqu 224(%rdi), %ymm3 - vperm2i128 $0x21, %ymm2, %ymm1, %ymm1 + vperm2i128 $33, %ymm2, %ymm1, %ymm1 vpermq $0xe9, %ymm1, %ymm4 vpshufb %ymm8, %ymm4, %ymm4 vpsrlvd %ymm9, %ymm4, %ymm4 @@ -23264,7 +23264,7 @@ _wc_mldsa_decode_t1_avx2: vpslld $13, %ymm4, %ymm4 vmovdqu %ymm4, 672(%rsi) # 23/32 - vperm2i128 $0x21, %ymm3, %ymm2, %ymm2 + vperm2i128 $33, %ymm3, %ymm2, %ymm2 vpermq $0xe9, %ymm2, %ymm4 vpshufb %ymm7, %ymm4, %ymm4 vpsrlvd %ymm9, %ymm4, %ymm4 @@ -23288,7 +23288,7 @@ _wc_mldsa_decode_t1_avx2: # 26/32 vmovdqu 256(%rdi), %ymm0 vmovdqu 288(%rdi), %ymm1 - vperm2i128 $0x21, %ymm0, %ymm3, %ymm3 + vperm2i128 $33, %ymm0, %ymm3, %ymm3 vpermq $0x99, %ymm3, %ymm4 vpshufb %ymm6, %ymm4, %ymm4 vpsrlvd %ymm9, %ymm4, %ymm4 @@ -23310,7 +23310,7 @@ _wc_mldsa_decode_t1_avx2: vpslld $13, %ymm4, %ymm4 vmovdqu %ymm4, 864(%rsi) # 29/32 - vperm2i128 $0x21, %ymm1, %ymm0, %ymm0 + vperm2i128 $33, %ymm1, %ymm0, %ymm0 vpermq $0x99, %ymm0, %ymm4 vpshufb %ymm5, %ymm4, %ymm4 vpsrlvd %ymm9, %ymm4, %ymm4 
@@ -23332,7 +23332,7 @@ _wc_mldsa_decode_t1_avx2: vpslld $13, %ymm4, %ymm4 vmovdqu %ymm4, 960(%rsi) # 32/32 - vpermq $0x3e, %ymm1, %ymm4 + vpermq $62, %ymm1, %ymm4 vpshufb %ymm8, %ymm4, %ymm4 vpsrlvd %ymm9, %ymm4, %ymm4 vpand %ymm10, %ymm4, %ymm4 diff --git a/wolfcrypt/src/wc_mldsa_asm.asm b/wolfcrypt/src/wc_mldsa_asm.asm new file mode 100644 index 00000000000..16ad868c23f --- /dev/null +++ b/wolfcrypt/src/wc_mldsa_asm.asm @@ -0,0 +1,34618 @@ +; /* wc_mldsa_asm.asm */ +; /* +; * Copyright (C) 2006-2026 wolfSSL Inc. +; * +; * This file is part of wolfSSL. +; * +; * wolfSSL is free software; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation; either version 3 of the License, or +; * (at your option) any later version. +; * +; * wolfSSL is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. 
+; * +; * You should have received a copy of the GNU General Public License +; * along with this program; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA +; */ + +IF @Version LT 1200 +; AVX2 instructions not recognized by old versions of MASM +IFNDEF NO_AVX2_SUPPORT +NO_AVX2_SUPPORT = 1 +ENDIF +; MOVBE instruction not recognized by old versions of MASM +IFNDEF NO_MOVBE_SUPPORT +NO_MOVBE_SUPPORT = 1 +ENDIF +ENDIF + +IFNDEF HAVE_INTEL_AVX1 +HAVE_INTEL_AVX1 = 1 +ENDIF +IFNDEF NO_AVX2_SUPPORT +HAVE_INTEL_AVX2 = 1 +ENDIF + +IFNDEF _WIN64 +_WIN64 = 1 +ENDIF + +IFDEF WOLFSSL_HAVE_MLDSA +IFDEF HAVE_INTEL_AVX2 +_DATA SEGMENT +ALIGN 16 +mldsa_q DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h + DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h +ptr_mldsa_q QWORD mldsa_q +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +mldsa_qinv DWORD 03802001h, 03802001h, 03802001h, 03802001h + DWORD 03802001h, 03802001h, 03802001h, 03802001h +ptr_mldsa_qinv QWORD mldsa_qinv +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +mldsa_v DWORD 00400000h, 00400000h, 00400000h, 00400000h + DWORD 00400000h, 00400000h, 00400000h, 00400000h +ptr_mldsa_v QWORD mldsa_v +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_avx2_zetas DWORD 000064f7h, 000064f7h, 000064f7h, 000064f7h + DWORD 000064f7h, 000064f7h, 000064f7h, 000064f7h + DWORD 6d1f44f7h, 6d1f44f7h, 6d1f44f7h, 6d1f44f7h + DWORD 6d1f44f7h, 6d1f44f7h, 6d1f44f7h, 6d1f44f7h + DWORD 0ffd83102h, 0ffd83102h, 0ffd83102h, 0ffd83102h + DWORD 0ffd83102h, 0ffd83102h, 0ffd83102h, 0ffd83102h + DWORD 8cf87102h, 8cf87102h, 8cf87102h, 8cf87102h + DWORD 8cf87102h, 8cf87102h, 8cf87102h, 8cf87102h + DWORD 0fff81503h, 0fff81503h, 0fff81503h, 0fff81503h + DWORD 0fff81503h, 0fff81503h, 0fff81503h, 0fff81503h + DWORD 8d187503h, 8d187503h, 8d187503h, 8d187503h + DWORD 8d187503h, 8d187503h, 8d187503h, 8d187503h + DWORD 00039e44h, 00039e44h, 00039e44h, 00039e44h + DWORD 00039e44h, 00039e44h, 00039e44h, 00039e44h + DWORD 61cc1e44h, 
61cc1e44h, 61cc1e44h, 61cc1e44h + DWORD 61cc1e44h, 61cc1e44h, 61cc1e44h, 61cc1e44h + DWORD 001bde2bh, 001bde2bh, 001bde2bh, 001bde2bh + DWORD 001bde2bh, 001bde2bh, 001bde2bh, 001bde2bh + DWORD 12613e2bh, 12613e2bh, 12613e2bh, 12613e2bh + DWORD 12613e2bh, 12613e2bh, 12613e2bh, 12613e2bh + DWORD 0023e92bh, 0023e92bh, 0023e92bh, 0023e92bh + DWORD 0023e92bh, 0023e92bh, 0023e92bh, 0023e92bh + DWORD 93c9492bh, 93c9492bh, 93c9492bh, 93c9492bh + DWORD 93c9492bh, 93c9492bh, 93c9492bh, 93c9492bh + DWORD 00299658h, 00299658h, 00299658h, 00299658h + DWORD 00299658h, 00299658h, 00299658h, 00299658h + DWORD 66f49658h, 66f49658h, 66f49658h, 66f49658h + DWORD 66f49658h, 66f49658h, 66f49658h, 66f49658h + DWORD 000fa070h, 000fa070h, 000fa070h, 000fa070h + DWORD 000fa070h, 000fa070h, 000fa070h, 000fa070h + DWORD 7c1da070h, 7c1da070h, 7c1da070h, 7c1da070h + DWORD 7c1da070h, 7c1da070h, 7c1da070h, 7c1da070h + DWORD 0ffef85a4h, 0ffef85a4h, 0ffef85a4h, 0ffef85a4h + DWORD 0ffef85a4h, 0ffef85a4h, 0ffef85a4h, 0ffef85a4h + DWORD 0aea405a4h, 0aea405a4h, 0aea405a4h, 0aea405a4h + DWORD 0aea405a4h, 0aea405a4h, 0aea405a4h, 0aea405a4h + DWORD 0036b788h, 0036b788h, 0036b788h, 0036b788h + DWORD 0036b788h, 0036b788h, 0036b788h, 0036b788h + DWORD 3327b788h, 3327b788h, 3327b788h, 3327b788h + DWORD 3327b788h, 3327b788h, 3327b788h, 3327b788h + DWORD 00294a67h, 00294a67h, 00294a67h, 00294a67h + DWORD 00017620h, 00017620h, 00017620h, 00017620h + DWORD 91f62a67h, 91f62a67h, 91f62a67h, 91f62a67h + DWORD 9ec57620h, 9ec57620h, 9ec57620h, 9ec57620h + DWORD 002ef4cdh, 002ef4cdh, 002ef4cdh, 002ef4cdh + DWORD 0035dec5h, 0035dec5h, 0035dec5h, 0035dec5h + DWORD 0ac4894cdh, 0ac4894cdh, 0ac4894cdh, 0ac4894cdh + DWORD 6d8e7ec5h, 6d8e7ec5h, 6d8e7ec5h, 6d8e7ec5h + DWORD 0ffc406e5h, 0ffc406e5h, 0ffe8ac81h, 0ffe8ac81h + DWORD 0ffc7e1cfh, 0ffc7e1cfh, 0ffd19819h, 0ffd19819h + DWORD 0a220a6e5h, 0a220a6e5h, 0d8f8cc81h, 0d8f8cc81h + DWORD 5081c1cfh, 5081c1cfh, 8a54b819h, 8a54b819h + DWORD 0ffe9d65dh, 0ffe9d65dh, 003509eeh, 
003509eeh + DWORD 002135c7h, 002135c7h, 0ffe7cfbbh, 0ffe7cfbbh + DWORD 8035765dh, 8035765dh, 6272c9eeh, 6272c9eeh + DWORD 5f5a15c7h, 5f5a15c7h, 085f2fbbh, 085f2fbbh + DWORD 0ffe6a503h, 0ffe6a503h, 0ffe6a503h, 0ffe6a503h + DWORD 0ffc9302ch, 0ffc9302ch, 0ffc9302ch, 0ffc9302ch + DWORD 5f070503h, 5f070503h, 5f070503h, 5f070503h + DWORD 0bfceb02ch, 0bfceb02ch, 0bfceb02ch, 0bfceb02ch + DWORD 0ffd947d4h, 0ffd947d4h, 0ffd947d4h, 0ffd947d4h + DWORD 003bbeafh, 003bbeafh, 003bbeafh, 003bbeafh + DWORD 8ed3c7d4h, 8ed3c7d4h, 8ed3c7d4h, 8ed3c7d4h + DWORD 0dc919eafh, 0dc919eafh, 0dc919eafh, 0dc919eafh + DWORD 0ffeccf75h, 0ffeccf75h, 001d9772h, 001d9772h + DWORD 0ffc1b072h, 0ffc1b072h, 0fff0bcf6h, 0fff0bcf6h + DWORD 0b35b6f75h, 0b35b6f75h, 0c20bd772h, 0c20bd772h + DWORD 0c4cff072h, 0c4cff072h, 748f7cf6h, 748f7cf6h + DWORD 0ffcf5280h, 0ffcf5280h, 0ffcfd2aeh, 0ffcfd2aeh + DWORD 0ffc890e0h, 0ffc890e0h, 0001efcah, 0001efcah + DWORD 0aa1f5280h, 0aa1f5280h, 5b2592aeh, 5b2592aeh + DWORD 21e490e0h, 21e490e0h, 80fb2fcah, 80fb2fcah + DWORD 001fea93h, 0033ff5ah, 002358d4h, 003a41f8h + DWORD 0ffccff72h, 00223dfbh, 0ffdaab9fh, 0ffc9a422h + DWORD 0fff24a93h, 3b1f3f5ah, 513dd8d4h, 2c7941f8h + DWORD 0aebb3f72h, 36619dfbh, 01ce8b9fh, 0ab4de422h + DWORD 000412f5h, 00252587h, 0ffed24f0h, 00359b5dh + DWORD 0ffca48a0h, 0ffc6a2fch, 0ffedbb56h, 0ffcf45deh + DWORD 0dbe2b2f5h, 0fd560587h, 0ec8b24f0h, 79213b5dh + DWORD 78de48a0h, 462622fch, 64587b56h, 718b05deh + DWORD 000dbe5eh, 001c5e1ah, 000de0e6h, 000c7f5ah + DWORD 00078f83h, 0ffe7628ah, 0ffff5704h, 0fff806fch + DWORD 00d97e5eh, 0e6df9e1ah, 0e12aa0e6h, 4af7bf5ah + DWORD 3c77ef83h, 0cf38a28ah, 78dfd704h, 72d786fch + DWORD 0fff60021h, 0ffd05af6h, 001f0084h, 0030ef86h + DWORD 0ffc9b97dh, 0fff7fcd6h, 0fff44592h, 0ffc921c2h + DWORD 337a2021h, 682f1af6h, 0ae2f8084h, 7321af86h + DWORD 6c79597dh, 0ec92bcd6h, 07a68592h, 4b0161c2h + DWORD 0fff42118h, 0fff42118h, 0fff42118h, 0fff42118h + DWORD 0fff42118h, 0fff42118h, 0fff42118h, 0fff42118h + DWORD 58172118h, 
58172118h, 58172118h, 58172118h + DWORD 58172118h, 58172118h, 58172118h, 58172118h + DWORD 0fffa84adh, 0fffa84adh, 0fffa84adh, 0fffa84adh + DWORD 0fffa84adh, 0fffa84adh, 0fffa84adh, 0fffa84adh + DWORD 0ae1024adh, 0ae1024adh, 0ae1024adh, 0ae1024adh + DWORD 0ae1024adh, 0ae1024adh, 0ae1024adh, 0ae1024adh + DWORD 0ffe0147fh, 0ffe0147fh, 0ffe0147fh, 0ffe0147fh + DWORD 0ffe0147fh, 0ffe0147fh, 0ffe0147fh, 0ffe0147fh + DWORD 0beeff47fh, 0beeff47fh, 0beeff47fh, 0beeff47fh + DWORD 0beeff47fh, 0beeff47fh, 0beeff47fh, 0beeff47fh + DWORD 0fff79d90h, 0fff79d90h, 0fff79d90h, 0fff79d90h + DWORD 0fff79d90h, 0fff79d90h, 0fff79d90h, 0fff79d90h + DWORD 6ba99d90h, 6ba99d90h, 6ba99d90h, 6ba99d90h + DWORD 6ba99d90h, 6ba99d90h, 6ba99d90h, 6ba99d90h + DWORD 0ffeeeaa0h, 0ffeeeaa0h, 0ffeeeaa0h, 0ffeeeaa0h + DWORD 0ffeeeaa0h, 0ffeeeaa0h, 0ffeeeaa0h, 0ffeeeaa0h + DWORD 0d42eaa0h, 0d42eaa0h, 0d42eaa0h, 0d42eaa0h + DWORD 0d42eaa0h, 0d42eaa0h, 0d42eaa0h, 0d42eaa0h + DWORD 0027f968h, 0027f968h, 0027f968h, 0027f968h + DWORD 0027f968h, 0027f968h, 0027f968h, 0027f968h + DWORD 0eb54f968h, 0eb54f968h, 0eb54f968h, 0eb54f968h + DWORD 0eb54f968h, 0eb54f968h, 0eb54f968h, 0eb54f968h + DWORD 0ffdfd37bh, 0ffdfd37bh, 0ffdfd37bh, 0ffdfd37bh + DWORD 0ffdfd37bh, 0ffdfd37bh, 0ffdfd37bh, 0ffdfd37bh + DWORD 28cf337bh, 28cf337bh, 28cf337bh, 28cf337bh + DWORD 28cf337bh, 28cf337bh, 28cf337bh, 28cf337bh + DWORD 0ffc51585h, 0ffc51585h, 0ffc51585h, 0ffc51585h + DWORD 0ffd18e7ch, 0ffd18e7ch, 0ffd18e7ch, 0ffd18e7ch + DWORD 0f3f5b585h, 0f3f5b585h, 0f3f5b585h, 0f3f5b585h + DWORD 0e3a10e7ch, 0e3a10e7ch, 0e3a10e7ch, 0e3a10e7ch + DWORD 00368a96h, 00368a96h, 00368a96h, 00368a96h + DWORD 0ffd43e41h, 0ffd43e41h, 0ffd43e41h, 0ffd43e41h + DWORD 0de894a96h, 0de894a96h, 0de894a96h, 0de894a96h + DWORD 6b1c5e41h, 6b1c5e41h, 6b1c5e41h, 6b1c5e41h + DWORD 003410f2h, 003410f2h, 0fff0fe85h, 0fff0fe85h + DWORD 0020c638h, 0020c638h, 00296e9fh, 00296e9fh + DWORD 0d15250f2h, 0d15250f2h, 0f1419e85h, 0f1419e85h + DWORD 0dce7c638h, 0dce7c638h, 
5a7d4e9fh, 5a7d4e9fh + DWORD 0ffd2b7a3h, 0ffd2b7a3h, 0ffc7a44bh, 0ffc7a44bh + DWORD 0fff9ba6dh, 0fff9ba6dh, 0ffda3409h, 0ffda3409h + DWORD 114717a3h, 114717a3h, 0fad1044bh, 0fad1044bh + DWORD 0b4c75a6dh, 0b4c75a6dh, 65db5409h, 65db5409h + DWORD 00360400h, 00360400h, 00360400h, 00360400h + DWORD 0fffb6a4dh, 0fffb6a4dh, 0fffb6a4dh, 0fffb6a4dh + DWORD 0c0b60400h, 0c0b60400h, 0c0b60400h, 0c0b60400h + DWORD 7ac50a4dh, 7ac50a4dh, 7ac50a4dh, 7ac50a4dh + DWORD 0023d69ch, 0023d69ch, 0023d69ch, 0023d69ch + DWORD 0fff7c55dh, 0fff7c55dh, 0fff7c55dh, 0fff7c55dh + DWORD 9cf7569ch, 9cf7569ch, 9cf7569ch, 9cf7569ch + DWORD 0be23655dh, 0be23655dh, 0be23655dh, 0be23655dh + DWORD 0fff5c282h, 0fff5c282h, 0ffed4113h, 0ffed4113h + DWORD 0ffffa63bh, 0ffffa63bh, 0ffec09f7h, 0ffec09f7h + DWORD 7f460282h, 7f460282h, 6a8fa113h, 6a8fa113h + DWORD 0c347063bh, 0c347063bh, 61aae9f7h, 61aae9f7h + DWORD 0fffa2bddh, 0fffa2bddh, 001495d4h, 001495d4h + DWORD 001c4563h, 001c4563h, 0ffea2c62h, 0ffea2c62h + DWORD 0caf5cbddh, 0caf5cbddh, 0f8cf15d4h, 0f8cf15d4h + DWORD 6348a563h, 6348a563h, 9c766c62h, 9c766c62h + DWORD 00053919h, 0004610ch, 0ffdacd41h, 003eb01bh + DWORD 003472e7h, 0ffcd003bh, 001a7cc7h, 00031924h + DWORD 7ea85919h, 3625e10ch, 0bd02ed41h, 34c2101bh + DWORD 0b71152e7h, 6e54603bh, 08335cc7h, 61279924h + DWORD 002b5ee5h, 00291199h, 0ffd87a3ah, 00134d71h + DWORD 003de11ch, 00130984h, 0025f051h, 00185a46h + DWORD 8d87fee5h, 0b9dc3199h, 0da1fba3ah, 75416d71h + DWORD 9e61611ch, 0af438984h, 0d9b01051h, 00611a46h + DWORD 0ffc68518h, 001314beh, 00283891h, 0ffc9db90h + DWORD 0ffd25089h, 001c853fh, 001d0b4bh, 0ffeff6a6h + DWORD 0a4698518h, 0fbaad4beh, 02ba5891h, 0b33bdb90h + DWORD 29637089h, 0ed44653fh, 28066b4bh, 43c4b6a6h + DWORD 0ffeba8beh, 0012e11bh, 0ffcd5e3eh, 0ffea2d2fh + DWORD 0fff91de4h, 001406c7h, 00327283h, 0ffe20d6eh + DWORD 0e0368beh, 3ab6411bh, 84951e3eh, 6a100d2fh + DWORD 0c1b59de4h, 396ce6c7h, 1902d283h, 428fcd6eh + DWORD 0fff2a128h, 0fff2a128h, 0fff2a128h, 0fff2a128h + DWORD 
0fff2a128h, 0fff2a128h, 0fff2a128h, 0fff2a128h + DWORD 6017a128h, 6017a128h, 6017a128h, 6017a128h + DWORD 6017a128h, 6017a128h, 6017a128h, 6017a128h + DWORD 002f9a75h, 002f9a75h, 002f9a75h, 002f9a75h + DWORD 002f9a75h, 002f9a75h, 002f9a75h, 002f9a75h + DWORD 8cfe3a75h, 8cfe3a75h, 8cfe3a75h, 8cfe3a75h + DWORD 8cfe3a75h, 8cfe3a75h, 8cfe3a75h, 8cfe3a75h + DWORD 0ffd3fb09h, 0ffd3fb09h, 0ffd3fb09h, 0ffd3fb09h + DWORD 0ffd3fb09h, 0ffd3fb09h, 0ffd3fb09h, 0ffd3fb09h + DWORD 1eb51b09h, 1eb51b09h, 1eb51b09h, 1eb51b09h + DWORD 1eb51b09h, 1eb51b09h, 1eb51b09h, 1eb51b09h + DWORD 0ffdfadd6h, 0ffdfadd6h, 0ffdfadd6h, 0ffdfadd6h + DWORD 0ffdfadd6h, 0ffdfadd6h, 0ffdfadd6h, 0ffdfadd6h + DWORD 629a6dd6h, 629a6dd6h, 629a6dd6h, 629a6dd6h + DWORD 629a6dd6h, 629a6dd6h, 629a6dd6h, 629a6dd6h + DWORD 0ffc51ae7h, 0ffc51ae7h, 0ffc51ae7h, 0ffc51ae7h + DWORD 0ffc51ae7h, 0ffc51ae7h, 0ffc51ae7h, 0ffc51ae7h + DWORD 0cba1fae7h, 0cba1fae7h, 0cba1fae7h, 0cba1fae7h + DWORD 0cba1fae7h, 0cba1fae7h, 0cba1fae7h, 0cba1fae7h + DWORD 0ffeaa4f7h, 0ffeaa4f7h, 0ffeaa4f7h, 0ffeaa4f7h + DWORD 0ffeaa4f7h, 0ffeaa4f7h, 0ffeaa4f7h, 0ffeaa4f7h + DWORD 0b50984f7h, 0b50984f7h, 0b50984f7h, 0b50984f7h + DWORD 0b50984f7h, 0b50984f7h, 0b50984f7h, 0b50984f7h + DWORD 0ffcdfc98h, 0ffcdfc98h, 0ffcdfc98h, 0ffcdfc98h + DWORD 0ffcdfc98h, 0ffcdfc98h, 0ffcdfc98h, 0ffcdfc98h + DWORD 0d360fc98h, 0d360fc98h, 0d360fc98h, 0d360fc98h + DWORD 0d360fc98h, 0d360fc98h, 0d360fc98h, 0d360fc98h + DWORD 0ffe6123dh, 0ffe6123dh, 0ffe6123dh, 0ffe6123dh + DWORD 0ffe6ead6h, 0ffe6ead6h, 0ffe6ead6h, 0ffe6ead6h + DWORD 97adb23dh, 97adb23dh, 97adb23dh, 97adb23dh + DWORD 0ca41aad6h, 0ca41aad6h, 0ca41aad6h, 0ca41aad6h + DWORD 00357e1eh, 00357e1eh, 00357e1eh, 00357e1eh + DWORD 0ffc5af59h, 0ffc5af59h, 0ffc5af59h, 0ffc5af59h + DWORD 18f93e1eh, 18f93e1eh, 18f93e1eh, 18f93e1eh + DWORD 6d30cf59h, 6d30cf59h, 6d30cf59h, 6d30cf59h + DWORD 0ffccfbe9h, 0ffccfbe9h, 00040af0h, 00040af0h + DWORD 0007c417h, 0007c417h, 002f4588h, 002f4588h + DWORD 4eca1be9h, 4eca1be9h, 
0c9620af0h, 0c9620af0h + DWORD 490aa417h, 490aa417h, 44e04588h, 44e04588h + DWORD 0000ad00h, 0000ad00h, 0ffef36beh, 0ffef36beh + DWORD 000dcd44h, 000dcd44h, 003c675ah, 003c675ah + DWORD 95a0ad00h, 95a0ad00h, 7fc6f6beh, 7fc6f6beh + DWORD 27b64d44h, 27b64d44h, 4827a75ah, 4827a75ah + DWORD 0035843fh, 0035843fh, 0035843fh, 0035843fh + DWORD 0ffdf5617h, 0ffdf5617h, 0ffdf5617h, 0ffdf5617h + DWORD 8d3d643fh, 8d3d643fh, 8d3d643fh, 8d3d643fh + DWORD 3b223617h, 3b223617h, 3b223617h, 3b223617h + DWORD 0ffe7945ch, 0ffe7945ch, 0ffe7945ch, 0ffe7945ch + DWORD 0038738ch, 0038738ch, 0038738ch, 0038738ch + DWORD 3473145ch, 3473145ch, 3473145ch, 3473145ch + DWORD 78a9f38ch, 78a9f38ch, 78a9f38ch, 78a9f38ch + DWORD 0ffc72bcah, 0ffc72bcah, 0ffffde7eh, 0ffffde7eh + DWORD 00193948h, 00193948h, 0ffce69c0h, 0ffce69c0h + DWORD 28406bcah, 28406bcah, 0b4cf9e7eh, 0b4cf9e7eh + DWORD 0a3423948h, 0a3423948h, 0ed0669c0h, 0ed0669c0h + DWORD 0024756ch, 0024756ch, 0fffcc7dfh, 0fffcc7dfh + DWORD 000b98a1h, 000b98a1h, 0ffebe808h, 0ffebe808h + DWORD 88d1f56ch, 88d1f56ch, 2578a7dfh, 2578a7dfh + DWORD 0a69fb8a1h, 0a69fb8a1h, 98ece808h, 98ece808h + DWORD 0ffec7953h, 001d4099h, 0ffd92578h, 0ffeb05adh + DWORD 0016e405h, 000bdbe7h, 00221de8h, 0033f8cfh + DWORD 3196d953h, 0bfb06099h, 48882578h, 3e20a5adh + DWORD 0ee178405h, 2408bbe7h, 0efdf1de8h, 53cdd8cfh + DWORD 0fff7b934h, 0ffd4ca0ch, 0ffe67ff8h, 0ffe3d157h + DWORD 0ffd8911bh, 0ffc72c12h, 000910d8h, 0ffc65e1fh + DWORD 2d1e3934h, 0c3164a0ch, 0b3e57ff8h, 2a8eb157h + DWORD 0f07bf11bh, 24496c12h, 162410d8h, 380a3e1fh + DWORD 0ffe14658h, 00251d8bh, 002573b7h, 0fffd7c8fh + DWORD 001ddd98h, 00336898h, 0002d4bbh, 0ffed93a7h + DWORD 5cac4658h, 0a567d8bh, 0af1c53b7h, 0a40f5c8fh + DWORD 4fd0dd98h, 81466898h, 0e91a34bbh, 7ae273a7h + DWORD 0ffcf6cbeh, 00027c1ch, 0018aa08h, 002dfd71h + DWORD 000c5ca5h, 0019379ah, 0ffc7a167h, 0ffe48c3dh + DWORD 86672cbeh, 0b185fc1ch, 3159aa08h, 0cb5c1d71h + DWORD 0cd20fca5h, 0c20c779ah, 0dc748167h, 66ec2c3dh + DWORD 00071e24h, 00071e24h, 
00071e24h, 00071e24h + DWORD 00071e24h, 00071e24h, 00071e24h, 00071e24h + DWORD 61cb9e24h, 61cb9e24h, 61cb9e24h, 61cb9e24h + DWORD 61cb9e24h, 61cb9e24h, 61cb9e24h, 61cb9e24h + DWORD 002f7a49h, 002f7a49h, 002f7a49h, 002f7a49h + DWORD 002f7a49h, 002f7a49h, 002f7a49h, 002f7a49h + DWORD 0eef89a49h, 0eef89a49h, 0eef89a49h, 0eef89a49h + DWORD 0eef89a49h, 0eef89a49h, 0eef89a49h, 0eef89a49h + DWORD 0028e527h, 0028e527h, 0028e527h, 0028e527h + DWORD 0028e527h, 0028e527h, 0028e527h, 0028e527h + DWORD 254dc527h, 254dc527h, 254dc527h, 254dc527h + DWORD 254dc527h, 254dc527h, 254dc527h, 254dc527h + DWORD 001ad035h, 001ad035h, 001ad035h, 001ad035h + DWORD 001ad035h, 001ad035h, 001ad035h, 001ad035h + DWORD 13a17035h, 13a17035h, 13a17035h, 13a17035h + DWORD 13a17035h, 13a17035h, 13a17035h, 13a17035h + DWORD 0ffffb422h, 0ffffb422h, 0ffffb422h, 0ffffb422h + DWORD 0ffffb422h, 0ffffb422h, 0ffffb422h, 0ffffb422h + DWORD 6d83f422h, 6d83f422h, 6d83f422h, 6d83f422h + DWORD 6d83f422h, 6d83f422h, 6d83f422h, 6d83f422h + DWORD 003d3201h, 003d3201h, 003d3201h, 003d3201h + DWORD 003d3201h, 003d3201h, 003d3201h, 003d3201h + DWORD 0a9fd5201h, 0a9fd5201h, 0a9fd5201h, 0a9fd5201h + DWORD 0a9fd5201h, 0a9fd5201h, 0a9fd5201h, 0a9fd5201h + DWORD 000445c5h, 000445c5h, 000445c5h, 000445c5h + DWORD 000445c5h, 000445c5h, 000445c5h, 000445c5h + DWORD 0ba3ce5c5h, 0ba3ce5c5h, 0ba3ce5c5h, 0ba3ce5c5h + DWORD 0ba3ce5c5h, 0ba3ce5c5h, 0ba3ce5c5h, 0ba3ce5c5h + DWORD 000c63a8h, 000c63a8h, 000c63a8h, 000c63a8h + DWORD 00081b9ah, 00081b9ah, 00081b9ah, 00081b9ah + DWORD 588163a8h, 588163a8h, 588163a8h, 588163a8h + DWORD 9e7b5b9ah, 9e7b5b9ah, 9e7b5b9ah, 9e7b5b9ah + DWORD 000e8f76h, 000e8f76h, 000e8f76h, 000e8f76h + DWORD 003b3853h, 003b3853h, 003b3853h, 003b3853h + DWORD 0eefd4f76h, 0eefd4f76h, 0eefd4f76h, 0eefd4f76h + DWORD 89c59853h, 89c59853h, 89c59853h, 89c59853h + DWORD 0002e46ch, 0002e46ch, 0ffc9c808h, 0ffc9c808h + DWORD 003036c2h, 003036c2h, 0ffe3bff6h, 0ffe3bff6h + DWORD 0d690646ch, 0d690646ch, 54cac808h, 
54cac808h + DWORD 0ae0876c2h, 0ae0876c2h, 54e27ff6h, 54e27ff6h + DWORD 0ffdb3c93h, 0ffdb3c93h, 0fffd4ae0h, 0fffd4ae0h + DWORD 00141305h, 00141305h, 00147792h, 00147792h + DWORD 69ed9c93h, 69ed9c93h, 0b9594ae0h, 0b9594ae0h + DWORD 13f4b305h, 13f4b305h, 0e06b792h, 0e06b792h + DWORD 003b8534h, 003b8534h, 003b8534h, 003b8534h + DWORD 0ffd8fc30h, 0ffd8fc30h, 0ffd8fc30h, 0ffd8fc30h + DWORD 0a6e20534h, 0a6e20534h, 0a6e20534h, 0a6e20534h + DWORD 0c75efc30h, 0c75efc30h, 0c75efc30h, 0c75efc30h + DWORD 001f9d54h, 001f9d54h, 001f9d54h, 001f9d54h + DWORD 0ffd54f2dh, 0ffd54f2dh, 0ffd54f2dh, 0ffd54f2dh + DWORD 99ca1d54h, 99ca1d54h, 99ca1d54h, 99ca1d54h + DWORD 0c73aef2dh, 0c73aef2dh, 0c73aef2dh, 0c73aef2dh + DWORD 00139e25h, 00139e25h, 0ffe7d0e0h, 0ffe7d0e0h + DWORD 0fff39944h, 0fff39944h, 0ffea0802h, 0ffea0802h + DWORD 0f5583e25h, 0f5583e25h, 0a03d0e0h, 0a03d0e0h + DWORD 0e11c1944h, 0e11c1944h, 47ea4802h, 47ea4802h + DWORD 0ffd1eea2h, 0ffd1eea2h, 0ffc4c79ch, 0ffc4c79ch + DWORD 0ffc8a057h, 0ffc8a057h, 003a97d9h, 003a97d9h + DWORD 74a62ea2h, 74a62ea2h, 3ab8479ch, 3ab8479ch + DWORD 44538057h, 44538057h, 0cab5b7d9h, 0cab5b7d9h + DWORD 0ffd1a13ch, 0035c539h, 003b0115h, 00041dc0h + DWORD 0021c4f7h, 0fff11bf4h, 001a35e7h, 0007340eh + DWORD 85f9213ch, 005ce539h, 29dda115h, 0a3bc1dc0h + DWORD 9940a4f7h, 0f96f9bf4h, 0ef5715e7h, 1788f40eh + DWORD 0fff97d45h, 001a4cd0h, 0ffe47caeh, 001d2668h + DWORD 0ffe68e98h, 0ffef2633h, 0fffc05dah, 0ffc57fdbh + DWORD 0a1221d45h, 21b44cd0h, 0f07a3caeh, 10ea2668h + DWORD 0e5b98e98h, 97358633h, 0fbb745dah, 2e40dfdbh + DWORD 0ffd32764h, 0ffdde1afh, 0fff993ddh, 0ffdd1d09h + DWORD 0002cc93h, 0fff11805h, 00189c2ah, 0ffc9e5a9h + DWORD 42bfa764h, 0a093c1afh, 0b7f533ddh, 42fe3d09h + DWORD 5c152c93h, 3471b805h, 0a69ddc2ah, 0bff05a9h + DWORD 0fff78a50h, 003bcf2ch, 0ffff434eh, 0ffeb36dfh + DWORD 003c15cah, 00155e68h, 0fff316b6h, 001e29ceh + DWORD 09418a50h, 94214f2ch, 7969034eh, 734716dfh + DWORD 0c5f555cah, 17e25e68h, 0dfc9d6b6h, 1657e9ceh +ptr_L_mldsa_avx2_zetas 
QWORD L_mldsa_avx2_zetas +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_avx2_zetas_basemul DWORD 0ffc406e5h, 0ffe9d65dh, 003cf91bh, 001729a3h + DWORD 0ffe8ac81h, 003509eeh, 0018537fh, 0ffcbf612h + DWORD 0ffc7e1cfh, 002135c7h, 00391e31h, 0ffdfca39h + DWORD 0ffd19819h, 0ffe7cfbbh, 002f67e7h, 00193045h + DWORD 0000a6e5h, 0000765dh, 0000591bh, 000089a3h + DWORD 0000cc81h, 0000c9eeh, 0000337fh, 00003612h + DWORD 0000c1cfh, 000015c7h, 00003e31h, 0000ea39h + DWORD 0000b819h, 00002fbbh, 000047e7h, 0000d045h + DWORD 0ffeccf75h, 0ffcf5280h, 0014308bh, 0031ad80h + DWORD 001d9772h, 0ffcfd2aeh, 0ffe3688eh, 00312d52h + DWORD 0ffc1b072h, 0ffc890e0h, 003f4f8eh, 00386f20h + DWORD 0fff0bcf6h, 0001efcah, 0010430ah, 0ffff1036h + DWORD 00006f75h, 00005280h, 0000908bh, 0000ad80h + DWORD 0000d772h, 000092aeh, 0000288eh, 00006d52h + DWORD 0000f072h, 000090e0h, 00000f8eh, 00006f20h + DWORD 00007cf6h, 00002fcah, 0000830ah, 0000d036h + DWORD 003410f2h, 0ffd2b7a3h, 0ffccef0eh, 002e485dh + DWORD 0fff0fe85h, 0ffc7a44bh, 0010017bh, 00395bb5h + DWORD 0020c638h, 0fff9ba6dh, 0ffe039c8h, 00074593h + DWORD 00296e9fh, 0ffda3409h, 0ffd79161h, 0026cbf7h + DWORD 000050f2h, 000017a3h, 0000af0eh, 0000e85dh + DWORD 00009e85h, 0000044bh, 0000617bh, 0000fbb5h + DWORD 0000c638h, 00005a6dh, 000039c8h, 0000a593h + DWORD 00004e9fh, 00005409h, 0000b161h, 0000abf7h + DWORD 0fff5c282h, 0fffa2bddh, 000b3d7eh, 0006d423h + DWORD 0ffed4113h, 001495d4h, 0013beedh, 0ffec6a2ch + DWORD 0ffffa63bh, 001c4563h, 000159c5h, 0ffe4ba9dh + DWORD 0ffec09f7h, 0ffea2c62h, 0014f609h, 0016d39eh + DWORD 00000282h, 0000cbddh, 0000fd7eh, 00003423h + DWORD 0000a113h, 000015d4h, 00005eedh, 0000ea2ch + DWORD 0000063bh, 0000a563h, 0000f9c5h, 00005a9dh + DWORD 0000e9f7h, 00006c62h, 00001609h, 0000939eh + DWORD 0ffccfbe9h, 0000ad00h, 00340417h, 00005300h + DWORD 00040af0h, 0ffef36beh, 0fffcf510h, 0011c942h + DWORD 0007c417h, 000dcd44h, 0fff93be9h, 0fff332bch + DWORD 002f4588h, 003c675ah, 0ffd1ba78h, 0ffc498a6h + DWORD 00001be9h, 0000ad00h, 
0000e417h, 00005300h + DWORD 00000af0h, 0000f6beh, 0000f510h, 00000942h + DWORD 0000a417h, 00004d44h, 00005be9h, 0000b2bch + DWORD 00004588h, 0000a75ah, 0000ba78h, 000058a6h + DWORD 0ffc72bcah, 0024756ch, 0039d436h, 0ffdc8a94h + DWORD 0ffffde7eh, 0fffcc7dfh, 00012182h, 00043821h + DWORD 00193948h, 000b98a1h, 0ffe7c6b8h, 0fff5675fh + DWORD 0ffce69c0h, 0ffebe808h, 00329640h, 001517f8h + DWORD 00006bcah, 0000f56ch, 00009436h, 00000a94h + DWORD 00009e7eh, 0000a7dfh, 00006182h, 00005821h + DWORD 00003948h, 0000b8a1h, 0000c6b8h, 0000475fh + DWORD 000069c0h, 0000e808h, 00009640h, 000017f8h + DWORD 0002e46ch, 0ffdb3c93h, 0fffe1b94h, 0025c36dh + DWORD 0ffc9c808h, 0fffd4ae0h, 003737f8h, 0003b520h + DWORD 003036c2h, 00141305h, 0ffd0c93eh, 0ffececfbh + DWORD 0ffe3bff6h, 00147792h, 001d400ah, 0ffec886eh + DWORD 0000646ch, 00009c93h, 00009b94h, 0000636dh + DWORD 0000c808h, 00004ae0h, 000037f8h, 0000b520h + DWORD 000076c2h, 0000b305h, 0000893eh, 00004cfbh + DWORD 00007ff6h, 0000b792h, 0000800ah, 0000486eh + DWORD 00139e25h, 0ffd1eea2h, 0ffed61dbh, 002f115eh + DWORD 0ffe7d0e0h, 0ffc4c79ch, 00192f20h, 003c3864h + DWORD 0fff39944h, 0ffc8a057h, 000d66bch, 00385fa9h + DWORD 0ffea0802h, 003a97d9h, 0016f7feh, 0ffc66827h + DWORD 00003e25h, 00002ea2h, 0000c1dbh, 0000d15eh + DWORD 0000d0e0h, 0000479ch, 00002f20h, 0000b864h + DWORD 00001944h, 00008057h, 0000e6bch, 00007fa9h + DWORD 00004802h, 0000b7d9h, 0000b7feh, 00004827h +ptr_L_mldsa_avx2_zetas_basemul QWORD L_mldsa_avx2_zetas_basemul +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_avx2_zetas_1 DWORD 0ffc97e01h, 0ffc97e01h, 0ffc97e01h, 0ffc97e01h + DWORD 0ffc97e01h, 0ffc97e01h, 0ffc97e01h, 0ffc97e01h +ptr_L_mldsa_avx2_zetas_1 QWORD L_mldsa_avx2_zetas_1 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_avx2_zetas_inv DWORD 0ffe1d632h, 000ce94ah, 0ffeaa198h, 0ffc3ea36h + DWORD 0014c921h, 0000bcb2h, 0ffc430d4h, 000875b0h + DWORD 0e9a81632h, 2036294ah, 0e81da198h, 3a0aaa36h + DWORD 8cb8e921h, 8696fcb2h, 6bdeb0d4h, 0f6be75b0h + DWORD 00361a57h, 
0ffe763d6h, 000ee7fbh, 0fffd336dh + DWORD 0022e2f7h, 00066c23h, 00221e51h, 002cd89ch + DWORD 0f400fa57h, 596223d6h, 0cb8e47fbh, 0a3ead36dh + DWORD 0bd01c2f7h, 480acc23h, 5f6c3e51h, 0bd40589ch + DWORD 0ffc56827h, 0ffc56827h, 00375fa9h, 00375fa9h + DWORD 003b3864h, 003b3864h, 002e115eh, 002e115eh + DWORD 354a4827h, 354a4827h, 0bbac7fa9h, 0bbac7fa9h + DWORD 0c547b864h, 0c547b864h, 8b59d15eh, 8b59d15eh + DWORD 0015f7feh, 0015f7feh, 000c66bch, 000c66bch + DWORD 00182f20h, 00182f20h, 0ffec61dbh, 0ffec61dbh + DWORD 0b815b7feh, 0b815b7feh, 1ee3e6bch, 1ee3e6bch + DWORD 0f5fc2f20h, 0f5fc2f20h, 0aa7c1dbh, 0aa7c1dbh + DWORD 002ab0d3h, 002ab0d3h, 002ab0d3h, 002ab0d3h + DWORD 0ffe062ach, 0ffe062ach, 0ffe062ach, 0ffe062ach + DWORD 38c510d3h, 38c510d3h, 38c510d3h, 38c510d3h + DWORD 6635e2ach, 6635e2ach, 6635e2ach, 6635e2ach + DWORD 002703d0h, 002703d0h, 002703d0h, 002703d0h + DWORD 0ffc47acch, 0ffc47acch, 0ffc47acch, 0ffc47acch + DWORD 38a103d0h, 38a103d0h, 38a103d0h, 38a103d0h + DWORD 591dfacch, 591dfacch, 591dfacch, 591dfacch + DWORD 0fffbba3bh, 0fffbba3bh, 0fffbba3bh, 0fffbba3bh + DWORD 0fffbba3bh, 0fffbba3bh, 0fffbba3bh, 0fffbba3bh + DWORD 45c31a3bh, 45c31a3bh, 45c31a3bh, 45c31a3bh + DWORD 45c31a3bh, 45c31a3bh, 45c31a3bh, 45c31a3bh + DWORD 0ffc2cdffh, 0ffc2cdffh, 0ffc2cdffh, 0ffc2cdffh + DWORD 0ffc2cdffh, 0ffc2cdffh, 0ffc2cdffh, 0ffc2cdffh + DWORD 5602adffh, 5602adffh, 5602adffh, 5602adffh + DWORD 5602adffh, 5602adffh, 5602adffh, 5602adffh + DWORD 0ffd71ad9h, 0ffd71ad9h, 0ffd71ad9h, 0ffd71ad9h + DWORD 0ffd71ad9h, 0ffd71ad9h, 0ffd71ad9h, 0ffd71ad9h + DWORD 0dab23ad9h, 0dab23ad9h, 0dab23ad9h, 0dab23ad9h + DWORD 0dab23ad9h, 0dab23ad9h, 0dab23ad9h, 0dab23ad9h + DWORD 003a8025h, 0003fa26h, 0010d9cdh, 00197168h + DWORD 0ffe2d998h, 001b8352h, 0ffe5b330h, 000682bbh + DWORD 0d1bf2025h, 0448ba26h, 68ca79cdh, 1a467168h + DWORD 0ef15d998h, 0f85c352h, 0de4bb330h, 5edde2bbh + DWORD 0fff8cbf2h, 0ffe5ca19h, 000ee40ch, 0ffde3b09h + DWORD 0fffbe240h, 0ffc4feebh, 0ffca3ac7h, 002e5ec4h + DWORD 
0e8770bf2h, 10a8ea19h, 0690640ch, 66bf5b09h + DWORD 5c43e240h, 0d6225eebh, 0ffa31ac7h, 7a06dec4h + DWORD 0ffeb886eh, 0ffeb886eh, 0ffebecfbh, 0ffebecfbh + DWORD 0002b520h, 0002b520h, 0024c36dh, 0024c36dh + DWORD 0f1f9486eh, 0f1f9486eh, 0ec0b4cfbh, 0ec0b4cfbh + DWORD 46a6b520h, 46a6b520h, 9612636dh, 9612636dh + DWORD 001c400ah, 001c400ah, 0ffcfc93eh, 0ffcfc93eh + DWORD 003637f8h, 003637f8h, 0fffd1b94h, 0fffd1b94h + DWORD 0ab1d800ah, 0ab1d800ah, 51f7893eh, 51f7893eh + DWORD 0ab3537f8h, 0ab3537f8h, 296f9b94h, 296f9b94h + DWORD 0ffc4c7adh, 0ffc4c7adh, 0ffc4c7adh, 0ffc4c7adh + DWORD 0fff1708ah, 0fff1708ah, 0fff1708ah, 0fff1708ah + DWORD 763a67adh, 763a67adh, 763a67adh, 763a67adh + DWORD 1102b08ah, 1102b08ah, 1102b08ah, 1102b08ah + DWORD 0fff7e466h, 0fff7e466h, 0fff7e466h, 0fff7e466h + DWORD 0fff39c58h, 0fff39c58h, 0fff39c58h, 0fff39c58h + DWORD 6184a466h, 6184a466h, 6184a466h, 6184a466h + DWORD 0a77e9c58h, 0a77e9c58h, 0a77e9c58h, 0a77e9c58h + DWORD 00004bdeh, 00004bdeh, 00004bdeh, 00004bdeh + DWORD 00004bdeh, 00004bdeh, 00004bdeh, 00004bdeh + DWORD 927c0bdeh, 927c0bdeh, 927c0bdeh, 927c0bdeh + DWORD 927c0bdeh, 927c0bdeh, 927c0bdeh, 927c0bdeh + DWORD 0ffe52fcbh, 0ffe52fcbh, 0ffe52fcbh, 0ffe52fcbh + DWORD 0ffe52fcbh, 0ffe52fcbh, 0ffe52fcbh, 0ffe52fcbh + DWORD 0ec5e8fcbh, 0ec5e8fcbh, 0ec5e8fcbh, 0ec5e8fcbh + DWORD 0ec5e8fcbh, 0ec5e8fcbh, 0ec5e8fcbh, 0ec5e8fcbh + DWORD 0ffd085b7h, 0ffd085b7h, 0ffd085b7h, 0ffd085b7h + DWORD 0ffd085b7h, 0ffd085b7h, 0ffd085b7h, 0ffd085b7h + DWORD 110765b7h, 110765b7h, 110765b7h, 110765b7h + DWORD 110765b7h, 110765b7h, 110765b7h, 110765b7h + DWORD 0fff8e1dch, 0fff8e1dch, 0fff8e1dch, 0fff8e1dch + DWORD 0fff8e1dch, 0fff8e1dch, 0fff8e1dch, 0fff8e1dch + DWORD 9e3461dch, 9e3461dch, 9e3461dch, 9e3461dch + DWORD 9e3461dch, 9e3461dch, 9e3461dch, 9e3461dch + DWORD 001b73c3h, 00385e99h, 0ffe6c866h, 0fff3a35bh + DWORD 0ffd2028fh, 0ffe755f8h, 0fffd83e4h, 00309342h + DWORD 9913d3c3h, 238b7e99h, 3df38866h, 32df035bh + DWORD 34a3e28fh, 0cea655f8h, 4e7a03e4h, 
7998d342h + DWORD 00126c59h, 0fffd2b45h, 0ffcc9768h, 0ffe22268h + DWORD 00028371h, 0ffda8c49h, 0ffdae275h, 001eb9a8h + DWORD 851d8c59h, 16e5cb45h, 7eb99768h, 0b02f2268h + DWORD 5bf0a371h, 50e3ac49h, 0f5a98275h, 0a353b9a8h + DWORD 001417f8h, 001417f8h, 0fff4675fh, 0fff4675fh + DWORD 00033821h, 00033821h, 0ffdb8a94h, 0ffdb8a94h + DWORD 671317f8h, 671317f8h, 5960475fh, 5960475fh + DWORD 0da875821h, 0da875821h, 772e0a94h, 772e0a94h + DWORD 00319640h, 00319640h, 0ffe6c6b8h, 0ffe6c6b8h + DWORD 00002182h, 00002182h, 0038d436h, 0038d436h + DWORD 12f99640h, 12f99640h, 5cbdc6b8h, 5cbdc6b8h + DWORD 4b306182h, 4b306182h, 0d7bf9436h, 0d7bf9436h + DWORD 0ffc78c74h, 0ffc78c74h, 0ffc78c74h, 0ffc78c74h + DWORD 00186ba4h, 00186ba4h, 00186ba4h, 00186ba4h + DWORD 87560c74h, 87560c74h, 87560c74h, 87560c74h + DWORD 0cb8ceba4h, 0cb8ceba4h, 0cb8ceba4h, 0cb8ceba4h + DWORD 0020a9e9h, 0020a9e9h, 0020a9e9h, 0020a9e9h + DWORD 0ffca7bc1h, 0ffca7bc1h, 0ffca7bc1h, 0ffca7bc1h + DWORD 0c4ddc9e9h, 0c4ddc9e9h, 0c4ddc9e9h, 0c4ddc9e9h + DWORD 72c29bc1h, 72c29bc1h, 72c29bc1h, 72c29bc1h + DWORD 00320368h, 00320368h, 00320368h, 00320368h + DWORD 00320368h, 00320368h, 00320368h, 00320368h + DWORD 2c9f0368h, 2c9f0368h, 2c9f0368h, 2c9f0368h + DWORD 2c9f0368h, 2c9f0368h, 2c9f0368h, 2c9f0368h + DWORD 00155b09h, 00155b09h, 00155b09h, 00155b09h + DWORD 00155b09h, 00155b09h, 00155b09h, 00155b09h + DWORD 4af67b09h, 4af67b09h, 4af67b09h, 4af67b09h + DWORD 4af67b09h, 4af67b09h, 4af67b09h, 4af67b09h + DWORD 002c04f7h, 002c04f7h, 002c04f7h, 002c04f7h + DWORD 002c04f7h, 002c04f7h, 002c04f7h, 002c04f7h + DWORD 0e14ae4f7h, 0e14ae4f7h, 0e14ae4f7h, 0e14ae4f7h + DWORD 0e14ae4f7h, 0e14ae4f7h, 0e14ae4f7h, 0e14ae4f7h + DWORD 0039a1e1h, 0fff6ef28h, 0038d3eeh, 00276ee5h + DWORD 001c2ea9h, 00198008h, 002b35f4h, 000846cch + DWORD 0c7f5c1e1h, 0e9dbef28h, 0dbb693eeh, 0f840ee5h + DWORD 0d5714ea9h, 4c1a8008h, 3ce9b5f4h, 0d2e1c6cch + DWORD 0ffcc0731h, 0ffdde218h, 0fff42419h, 0ffe91bfbh + DWORD 0014fa53h, 0026da88h, 0ffe2bf67h, 
001386adh + DWORD 0ac322731h, 1020e218h, 0dbf74419h, 11e87bfbh + DWORD 0c1df5a53h, 0b777da88h, 404f9f67h, 0ce6926adh + DWORD 0ffc398a6h, 0ffc398a6h, 0fff232bch, 0fff232bch + DWORD 0010c942h, 0010c942h, 0ffff5300h, 0ffff5300h + DWORD 0b7d858a6h, 0b7d858a6h, 0d849b2bch, 0d849b2bch + DWORD 80390942h, 80390942h, 6a5f5300h, 6a5f5300h + DWORD 0ffd0ba78h, 0ffd0ba78h, 0fff83be9h, 0fff83be9h + DWORD 0fffbf510h, 0fffbf510h, 00330417h, 00330417h + DWORD 0bb1fba78h, 0bb1fba78h, 0b6f55be9h, 0b6f55be9h + DWORD 369df510h, 369df510h, 0b135e417h, 0b135e417h + DWORD 003a50a7h, 003a50a7h, 003a50a7h, 003a50a7h + DWORD 0ffca81e2h, 0ffca81e2h, 0ffca81e2h, 0ffca81e2h + DWORD 92cf30a7h, 92cf30a7h, 92cf30a7h, 92cf30a7h + DWORD 0e706c1e2h, 0e706c1e2h, 0e706c1e2h, 0e706c1e2h + DWORD 0019152ah, 0019152ah, 0019152ah, 0019152ah + DWORD 0019edc3h, 0019edc3h, 0019edc3h, 0019edc3h + DWORD 35be552ah, 35be552ah, 35be552ah, 35be552ah + DWORD 68524dc3h, 68524dc3h, 68524dc3h, 68524dc3h + DWORD 003ae519h, 003ae519h, 003ae519h, 003ae519h + DWORD 003ae519h, 003ae519h, 003ae519h, 003ae519h + DWORD 345e0519h, 345e0519h, 345e0519h, 345e0519h + DWORD 345e0519h, 345e0519h, 345e0519h, 345e0519h + DWORD 0020522ah, 0020522ah, 0020522ah, 0020522ah + DWORD 0020522ah, 0020522ah, 0020522ah, 0020522ah + DWORD 9d65922ah, 9d65922ah, 9d65922ah, 9d65922ah + DWORD 9d65922ah, 9d65922ah, 9d65922ah, 9d65922ah + DWORD 0ffd0658bh, 0ffd0658bh, 0ffd0658bh, 0ffd0658bh + DWORD 0ffd0658bh, 0ffd0658bh, 0ffd0658bh, 0ffd0658bh + DWORD 7301c58bh, 7301c58bh, 7301c58bh, 7301c58bh + DWORD 7301c58bh, 7301c58bh, 7301c58bh, 7301c58bh + DWORD 000d5ed8h, 000d5ed8h, 000d5ed8h, 000d5ed8h + DWORD 000d5ed8h, 000d5ed8h, 000d5ed8h, 000d5ed8h + DWORD 9fe85ed8h, 9fe85ed8h, 9fe85ed8h, 9fe85ed8h + DWORD 9fe85ed8h, 9fe85ed8h, 9fe85ed8h, 9fe85ed8h + DWORD 001df292h, 0ffcd8d7dh, 0ffebf939h, 0006e21ch + DWORD 0015d2d1h, 0032a1c2h, 0ffed1ee5h, 00145742h + DWORD 0bd703292h, 0e6fd2d7dh, 0c6931939h, 3e4a621ch + DWORD 95eff2d1h, 7b6ae1c2h, 0c549bee5h, 0f1fc9742h 
+ DWORD 0010095ah, 0ffe2f4b5h, 0ffe37ac1h, 002daf77h + DWORD 00362470h, 0ffd7c76fh, 0ffeceb42h, 00397ae8h + DWORD 0bc3b495ah, 0d7f994b5h, 12bb9ac1h, 0d69c8f77h + DWORD 4cc42470h, 0fd45a76fh, 04552b42h, 5b967ae8h + DWORD 0015d39eh, 0015d39eh, 0ffe3ba9dh, 0ffe3ba9dh + DWORD 0ffeb6a2ch, 0ffeb6a2ch, 0005d423h, 0005d423h + DWORD 6389939eh, 6389939eh, 9cb75a9dh, 9cb75a9dh + DWORD 0730ea2ch, 0730ea2ch, 350a3423h, 350a3423h + DWORD 0013f609h, 0013f609h, 000059c5h, 000059c5h + DWORD 0012beedh, 0012beedh, 000a3d7eh, 000a3d7eh + DWORD 9e551609h, 9e551609h, 3cb8f9c5h, 3cb8f9c5h + DWORD 95705eedh, 95705eedh, 80b9fd7eh, 80b9fd7eh + DWORD 00083aa3h, 00083aa3h, 00083aa3h, 00083aa3h + DWORD 0ffdc2964h, 0ffdc2964h, 0ffdc2964h, 0ffdc2964h + DWORD 41dc9aa3h, 41dc9aa3h, 41dc9aa3h, 41dc9aa3h + DWORD 6308a964h, 6308a964h, 6308a964h, 6308a964h + DWORD 000495b3h, 000495b3h, 000495b3h, 000495b3h + DWORD 0ffc9fc00h, 0ffc9fc00h, 0ffc9fc00h, 0ffc9fc00h + DWORD 853af5b3h, 853af5b3h, 853af5b3h, 853af5b3h + DWORD 3f49fc00h, 3f49fc00h, 3f49fc00h, 3f49fc00h + DWORD 00202c85h, 00202c85h, 00202c85h, 00202c85h + DWORD 00202c85h, 00202c85h, 00202c85h, 00202c85h + DWORD 0d730cc85h, 0d730cc85h, 0d730cc85h, 0d730cc85h + DWORD 0d730cc85h, 0d730cc85h, 0d730cc85h, 0d730cc85h + DWORD 0ffd80698h, 0ffd80698h, 0ffd80698h, 0ffd80698h + DWORD 0ffd80698h, 0ffd80698h, 0ffd80698h, 0ffd80698h + DWORD 14ab0698h, 14ab0698h, 14ab0698h, 14ab0698h + DWORD 14ab0698h, 14ab0698h, 14ab0698h, 14ab0698h + DWORD 001feb81h, 001feb81h, 001feb81h, 001feb81h + DWORD 001feb81h, 001feb81h, 001feb81h, 001feb81h + DWORD 41100b81h, 41100b81h, 41100b81h, 41100b81h + DWORD 41100b81h, 41100b81h, 41100b81h, 41100b81h + DWORD 0ffe7a5bah, 0ffda0fafh, 0ffecf67ch, 0ffc21ee4h + DWORD 0ffecb28fh, 002785c6h, 0ffd6ee67h, 0ffd4a11bh + DWORD 0ff9ee5bah, 264fefafh, 50bc767ch, 619e9ee4h + DWORD 8abe928fh, 25e045c6h, 4623ce67h, 7278011bh + DWORD 0fffce6dch, 0ffe58339h, 0032ffc5h, 0ffcb8d19h + DWORD 0ffc14fe5h, 002532bfh, 0fffb9ef4h, 0fffac6e7h + DWORD 
9ed866dch, 0f7cca339h, 91ab9fc5h, 48eead19h + DWORD 0cb3defe5h, 42fd12bfh, 0c9da1ef4h, 8157a6e7h + DWORD 0025cbf7h, 0025cbf7h, 00064593h, 00064593h + DWORD 00385bb5h, 00385bb5h, 002d485dh, 002d485dh + DWORD 9a24abf7h, 9a24abf7h, 4b38a593h, 4b38a593h + DWORD 052efbb5h, 052efbb5h, 0eeb8e85dh, 0eeb8e85dh + DWORD 0ffd69161h, 0ffd69161h, 0ffdf39c8h, 0ffdf39c8h + DWORD 000f017bh, 000f017bh, 0ffcbef0eh, 0ffcbef0eh + DWORD 0a582b161h, 0a582b161h, 231839c8h, 231839c8h + DWORD 0ebe617bh, 0ebe617bh, 2eadaf0eh, 2eadaf0eh + DWORD 002bc1bfh, 002bc1bfh, 002bc1bfh, 002bc1bfh + DWORD 0ffc9756ah, 0ffc9756ah, 0ffc9756ah, 0ffc9756ah + DWORD 94e3a1bfh, 94e3a1bfh, 94e3a1bfh, 94e3a1bfh + DWORD 2176b56ah, 2176b56ah, 2176b56ah, 2176b56ah + DWORD 002e7184h, 002e7184h, 002e7184h, 002e7184h + DWORD 003aea7bh, 003aea7bh, 003aea7bh, 003aea7bh + DWORD 1c5ef184h, 1c5ef184h, 1c5ef184h, 1c5ef184h + DWORD 0c0a4a7bh, 0c0a4a7bh, 0c0a4a7bh, 0c0a4a7bh + DWORD 00111560h, 00111560h, 00111560h, 00111560h + DWORD 00111560h, 00111560h, 00111560h, 00111560h + DWORD 0f2bd1560h, 0f2bd1560h, 0f2bd1560h, 0f2bd1560h + DWORD 0f2bd1560h, 0f2bd1560h, 0f2bd1560h, 0f2bd1560h + DWORD 00086270h, 00086270h, 00086270h, 00086270h + DWORD 00086270h, 00086270h, 00086270h, 00086270h + DWORD 94566270h, 94566270h, 94566270h, 94566270h + DWORD 94566270h, 94566270h, 94566270h, 94566270h + DWORD 00057b53h, 00057b53h, 00057b53h, 00057b53h + DWORD 00057b53h, 00057b53h, 00057b53h, 00057b53h + DWORD 51efdb53h, 51efdb53h, 51efdb53h, 51efdb53h + DWORD 51efdb53h, 51efdb53h, 51efdb53h, 51efdb53h + DWORD 000bdee8h, 000bdee8h, 000bdee8h, 000bdee8h + DWORD 000bdee8h, 000bdee8h, 000bdee8h, 000bdee8h + DWORD 0a7e8dee8h, 0a7e8dee8h, 0a7e8dee8h, 0a7e8dee8h + DWORD 0a7e8dee8h, 0a7e8dee8h, 0a7e8dee8h, 0a7e8dee8h + DWORD 0036de3eh, 000bba6eh, 0008032ah, 00364683h + DWORD 0ffcf107ah, 0ffe0ff7ch, 002fa50ah, 0009ffdfh + DWORD 0b4fe9e3eh, 0f8597a6eh, 136d432ah, 9386a683h + DWORD 8cde507ah, 51d07f7ch, 97d0e50ah, 0cc85dfdfh + DWORD 0007f904h, 0000a8fch, 
00189d76h, 0fff8707dh + DWORD 0fff380a6h, 0fff21f1ah, 0ffe3a1e6h, 0fff241a2h + DWORD 8d287904h, 872028fch, 30c75d76h, 0c388107dh + DWORD 0b50840a6h, 1ed55f1ah, 192061e6h, 0ff2681a2h + DWORD 0fffe1036h, 0fffe1036h, 00376f20h, 00376f20h + DWORD 00302d52h, 00302d52h, 0030ad80h, 0030ad80h + DWORD 7f04d036h, 7f04d036h, 0de1b6f20h, 0de1b6f20h + DWORD 0a4da6d52h, 0a4da6d52h, 55e0ad80h, 55e0ad80h + DWORD 000f430ah, 000f430ah, 003e4f8eh, 003e4f8eh + DWORD 0ffe2688eh, 0ffe2688eh, 0013308bh, 0013308bh + DWORD 8b70830ah, 8b70830ah, 3b300f8eh, 3b300f8eh + DWORD 3df4288eh, 3df4288eh, 4ca4908bh, 4ca4908bh + DWORD 0ffc44151h, 0ffc44151h, 0ffc44151h, 0ffc44151h + DWORD 0026b82ch, 0026b82ch, 0026b82ch, 0026b82ch + DWORD 236e6151h, 236e6151h, 236e6151h, 236e6151h + DWORD 712c382ch, 712c382ch, 712c382ch, 712c382ch + DWORD 0036cfd4h, 0036cfd4h, 0036cfd4h, 0036cfd4h + DWORD 00195afdh, 00195afdh, 00195afdh, 00195afdh + DWORD 40314fd4h, 40314fd4h, 40314fd4h, 40314fd4h + DWORD 0a0f8fafdh, 0a0f8fafdh, 0a0f8fafdh, 0a0f8fafdh + DWORD 0ffc94878h, 0ffc94878h, 0ffc94878h, 0ffc94878h + DWORD 0ffc94878h, 0ffc94878h, 0ffc94878h, 0ffc94878h + DWORD 0ccd84878h, 0ccd84878h, 0ccd84878h, 0ccd84878h + DWORD 0ccd84878h, 0ccd84878h, 0ccd84878h, 0ccd84878h + DWORD 00107a5ch, 00107a5ch, 00107a5ch, 00107a5ch + DWORD 00107a5ch, 00107a5ch, 00107a5ch, 00107a5ch + DWORD 515bfa5ch, 515bfa5ch, 515bfa5ch, 515bfa5ch + DWORD 515bfa5ch, 515bfa5ch, 515bfa5ch, 515bfa5ch + DWORD 0ffdc16d5h, 0ffdc16d5h, 0ffdc16d5h, 0ffdc16d5h + DWORD 0ffdc16d5h, 0ffdc16d5h, 0ffdc16d5h, 0ffdc16d5h + DWORD 6c36b6d5h, 6c36b6d5h, 6c36b6d5h, 6c36b6d5h + DWORD 6c36b6d5h, 6c36b6d5h, 6c36b6d5h, 6c36b6d5h + DWORD 0030ba22h, 001244aah, 00395d04h, 0035b760h + DWORD 0ffca64a3h, 0012db10h, 0ffdada79h, 0fffbed0bh + DWORD 8e74fa22h, 9ba784aah, 0b9d9dd04h, 8721b760h + DWORD 86dec4a3h, 1374db10h, 02a9fa79h, 241d4d0bh + DWORD 00365bdeh, 00255461h, 0ffddc205h, 0033008eh + DWORD 0ffc5be08h, 0ffdca72ch, 0ffcc00a6h, 0ffe0156dh + DWORD 54b21bdeh, 0fe317461h, 
0c99e6205h, 5144c08eh + DWORD 0d386be08h, 0aec2272ch, 0c4e0c0a6h, 000db56dh + DWORD 00183045h, 00183045h, 0ffdeca39h, 0ffdeca39h + DWORD 0ffcaf612h, 0ffcaf612h, 001629a3h, 001629a3h + DWORD 0f7a0d045h, 0f7a0d045h, 0a0a5ea39h, 0a0a5ea39h + DWORD 9d8d3612h, 9d8d3612h, 7fca89a3h, 7fca89a3h + DWORD 002e67e7h, 002e67e7h, 00381e31h, 00381e31h + DWORD 0017537fh, 0017537fh, 003bf91bh, 003bf91bh + DWORD 75ab47e7h, 75ab47e7h, 0af7e3e31h, 0af7e3e31h + DWORD 2707337fh, 2707337fh, 5ddf591bh, 5ddf591bh + DWORD 0ffca213bh, 0ffca213bh, 0ffca213bh, 0ffca213bh + DWORD 0ffd10b33h, 0ffd10b33h, 0ffd10b33h, 0ffd10b33h + DWORD 9271813bh, 9271813bh, 9271813bh, 9271813bh + DWORD 53b76b33h, 53b76b33h, 53b76b33h, 53b76b33h + DWORD 0fffe89e0h, 0fffe89e0h, 0fffe89e0h, 0fffe89e0h + DWORD 0ffd6b599h, 0ffd6b599h, 0ffd6b599h, 0ffd6b599h + DWORD 613a89e0h, 613a89e0h, 613a89e0h, 613a89e0h + DWORD 6e09d599h, 6e09d599h, 6e09d599h, 6e09d599h + DWORD 0fff05f90h, 0fff05f90h, 0fff05f90h, 0fff05f90h + DWORD 0fff05f90h, 0fff05f90h, 0fff05f90h, 0fff05f90h + DWORD 83e25f90h, 83e25f90h, 83e25f90h, 83e25f90h + DWORD 83e25f90h, 83e25f90h, 83e25f90h, 83e25f90h + DWORD 0ffd669a8h, 0ffd669a8h, 0ffd669a8h, 0ffd669a8h + DWORD 0ffd669a8h, 0ffd669a8h, 0ffd669a8h, 0ffd669a8h + DWORD 990b69a8h, 990b69a8h, 990b69a8h, 990b69a8h + DWORD 990b69a8h, 990b69a8h, 990b69a8h, 990b69a8h + DWORD 0ffe421d5h, 0ffe421d5h, 0ffe421d5h, 0ffe421d5h + DWORD 0ffe421d5h, 0ffe421d5h, 0ffe421d5h, 0ffe421d5h + DWORD 0ed9ec1d5h, 0ed9ec1d5h, 0ed9ec1d5h, 0ed9ec1d5h + DWORD 0ed9ec1d5h, 0ed9ec1d5h, 0ed9ec1d5h, 0ed9ec1d5h + DWORD 0fffc61bch, 0fffc61bch, 0fffc61bch, 0fffc61bch + DWORD 0fffc61bch, 0fffc61bch, 0fffc61bch, 0fffc61bch + DWORD 9e33e1bch, 9e33e1bch, 9e33e1bch, 9e33e1bch + DWORD 9e33e1bch, 9e33e1bch, 9e33e1bch, 9e33e1bch + DWORD 0007eafdh, 0007eafdh, 0007eafdh, 0007eafdh + DWORD 0007eafdh, 0007eafdh, 0007eafdh, 0007eafdh + DWORD 72e78afdh, 72e78afdh, 72e78afdh, 72e78afdh + DWORD 72e78afdh, 72e78afdh, 72e78afdh, 72e78afdh + DWORD 0027cefeh, 
0027cefeh, 0027cefeh, 0027cefeh + DWORD 0027cefeh, 0027cefeh, 0027cefeh, 0027cefeh + DWORD 73078efeh, 73078efeh, 73078efeh, 73078efeh + DWORD 73078efeh, 73078efeh, 73078efeh, 73078efeh + DWORD 0ffff9b09h, 0ffff9b09h, 0ffff9b09h, 0ffff9b09h + DWORD 0ffff9b09h, 0ffff9b09h, 0ffff9b09h, 0ffff9b09h + DWORD 92e0bb09h, 92e0bb09h, 92e0bb09h, 92e0bb09h + DWORD 92e0bb09h, 92e0bb09h, 92e0bb09h, 92e0bb09h + DWORD 0000a3fah, 0000a3fah, 0000a3fah, 0000a3fah + DWORD 0000a3fah, 0000a3fah, 0000a3fah, 0000a3fah + DWORD 0ff7fe3fah, 0ff7fe3fah, 0ff7fe3fah, 0ff7fe3fah + DWORD 0ff7fe3fah, 0ff7fe3fah, 0ff7fe3fah, 0ff7fe3fah +ptr_L_mldsa_avx2_zetas_inv QWORD L_mldsa_avx2_zetas_inv +_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_poly_red_avx2 PROC        ; in-place on the 256 dwords (1024 bytes) at [rcx]: x -= ((x + mldsa_v) >> 23) * mldsa_q
+        sub	rsp, 96        ; reserve 6 x 16 bytes of stack to preserve xmm6-xmm11
+        vmovdqu	OWORD PTR [rsp], xmm6        ; save xmm6-xmm11; they are reloaded before ret (presumably because Win64 treats them as callee-saved)
+        vmovdqu	OWORD PTR [rsp+16], xmm7
+        vmovdqu	OWORD PTR [rsp+32], xmm8
+        vmovdqu	OWORD PTR [rsp+48], xmm9
+        vmovdqu	OWORD PTR [rsp+64], xmm10
+        vmovdqu	OWORD PTR [rsp+80], xmm11
+        vpxor	ymm10, ymm10, ymm10        ; NOTE(review): looks redundant - ymm10 is fully overwritten by the next load; confirm before removing
+        vmovdqu	ymm10, YMMWORD PTR mldsa_q        ; ymm10 = 8 dwords from mldsa_q (defined outside this chunk; presumably the modulus in every lane)
+        vmovdqu	ymm11, YMMWORD PTR mldsa_v        ; ymm11 = 8 dwords from mldsa_v (defined outside this chunk; presumably a rounding offset)
+        vmovdqu	ymm0, YMMWORD PTR [rcx]        ; block 1 of 4: load bytes 0-255 (dwords 0-63) into ymm0-ymm7; rcx is the base pointer
+        vmovdqu	ymm1, YMMWORD PTR [rcx+32]
+        vmovdqu	ymm2, YMMWORD PTR [rcx+64]
+        vmovdqu	ymm3, YMMWORD PTR [rcx+96]
+        vmovdqu	ymm4, YMMWORD PTR [rcx+128]
+        vmovdqu	ymm5, YMMWORD PTR [rcx+160]
+        vmovdqu	ymm6, YMMWORD PTR [rcx+192]
+        vmovdqu	ymm7, YMMWORD PTR [rcx+224]
+        vpaddd	ymm8, ymm0, ymm11        ; t = x + v, per 32-bit lane (ymm0/ymm1 handled as an interleaved pair)
+        vpaddd	ymm9, ymm1, ymm11
+        vpsrad	ymm8, ymm8, 23        ; t = t >> 23, arithmetic (sign-preserving) shift
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10        ; t = low 32 bits of t * q
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm0, ymm0, ymm8        ; x = x - t
+        vpsubd	ymm1, ymm1, ymm9
+        vpaddd	ymm8, ymm2, ymm11        ; same four-step reduction for ymm2/ymm3
+        vpaddd	ymm9, ymm3, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm2, ymm2, ymm8
+        vpsubd	ymm3, ymm3, ymm9
+        vpaddd	ymm8, ymm4, ymm11        ; same reduction for ymm4/ymm5
+        vpaddd	ymm9, ymm5, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm4, ymm4, ymm8
+        vpsubd	ymm5, ymm5, ymm9
+        vpaddd	ymm8, ymm6, ymm11        ; same reduction for ymm6/ymm7
+        vpaddd	ymm9, ymm7, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm6, ymm6, ymm8
+        vpsubd	ymm7, ymm7, ymm9
+        vmovdqu	YMMWORD PTR [rcx], ymm0        ; write the reduced dwords 0-63 back over the input
+        vmovdqu	YMMWORD PTR [rcx+32], ymm1
+        vmovdqu	YMMWORD PTR [rcx+64], ymm2
+        vmovdqu	YMMWORD PTR [rcx+96], ymm3
+        vmovdqu	YMMWORD PTR [rcx+128], ymm4
+        vmovdqu	YMMWORD PTR [rcx+160], ymm5
+        vmovdqu	YMMWORD PTR [rcx+192], ymm6
+        vmovdqu	YMMWORD PTR [rcx+224], ymm7
+        vmovdqu	ymm0, YMMWORD PTR [rcx+256]        ; block 2 of 4: load bytes 256-511 (dwords 64-127)
+        vmovdqu	ymm1, YMMWORD PTR [rcx+288]
+        vmovdqu	ymm2, YMMWORD PTR [rcx+320]
+        vmovdqu	ymm3, YMMWORD PTR [rcx+352]
+        vmovdqu	ymm4, YMMWORD PTR [rcx+384]
+        vmovdqu	ymm5, YMMWORD PTR [rcx+416]
+        vmovdqu	ymm6, YMMWORD PTR [rcx+448]
+        vmovdqu	ymm7, YMMWORD PTR [rcx+480]
+        vpaddd	ymm8, ymm0, ymm11        ; reduce ymm0/ymm1: x -= ((x + v) >> 23) * q
+        vpaddd	ymm9, ymm1, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm0, ymm0, ymm8
+        vpsubd	ymm1, ymm1, ymm9
+        vpaddd	ymm8, ymm2, ymm11        ; reduce ymm2/ymm3
+        vpaddd	ymm9, ymm3, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm2, ymm2, ymm8
+        vpsubd	ymm3, ymm3, ymm9
+        vpaddd	ymm8, ymm4, ymm11        ; reduce ymm4/ymm5
+        vpaddd	ymm9, ymm5, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm4, ymm4, ymm8
+        vpsubd	ymm5, ymm5, ymm9
+        vpaddd	ymm8, ymm6, ymm11        ; reduce ymm6/ymm7
+        vpaddd	ymm9, ymm7, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm6, ymm6, ymm8
+        vpsubd	ymm7, ymm7, ymm9
+        vmovdqu	YMMWORD PTR [rcx+256], ymm0        ; write the reduced dwords 64-127 back over the input
+        vmovdqu	YMMWORD PTR [rcx+288], ymm1
+        vmovdqu	YMMWORD PTR [rcx+320], ymm2
+        vmovdqu	YMMWORD PTR [rcx+352], ymm3
+        vmovdqu	YMMWORD PTR [rcx+384], ymm4
+        vmovdqu	YMMWORD PTR [rcx+416], ymm5
+        vmovdqu	YMMWORD PTR [rcx+448], ymm6
+        vmovdqu	YMMWORD PTR [rcx+480], ymm7
+        vmovdqu	ymm0, YMMWORD PTR [rcx+512]        ; block 3 of 4: load bytes 512-767 (dwords 128-191)
+        vmovdqu	ymm1, YMMWORD PTR [rcx+544]
+        vmovdqu	ymm2, YMMWORD PTR [rcx+576]
+        vmovdqu	ymm3, YMMWORD PTR [rcx+608]
+        vmovdqu	ymm4, YMMWORD PTR [rcx+640]
+        vmovdqu	ymm5, YMMWORD PTR [rcx+672]
+        vmovdqu	ymm6, YMMWORD PTR [rcx+704]
+        vmovdqu	ymm7, YMMWORD PTR [rcx+736]
+        vpaddd	ymm8, ymm0, ymm11        ; reduce ymm0/ymm1: x -= ((x + v) >> 23) * q
+        vpaddd	ymm9, ymm1, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm0, ymm0, ymm8
+        vpsubd	ymm1, ymm1, ymm9
+        vpaddd	ymm8, ymm2, ymm11        ; reduce ymm2/ymm3
+        vpaddd	ymm9, ymm3, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm2, ymm2, ymm8
+        vpsubd	ymm3, ymm3, ymm9
+        vpaddd	ymm8, ymm4, ymm11        ; reduce ymm4/ymm5
+        vpaddd	ymm9, ymm5, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm4, ymm4, ymm8
+        vpsubd	ymm5, ymm5, ymm9
+        vpaddd	ymm8, ymm6, ymm11        ; reduce ymm6/ymm7
+        vpaddd	ymm9, ymm7, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm6, ymm6, ymm8
+        vpsubd	ymm7, ymm7, ymm9
+        vmovdqu	YMMWORD PTR [rcx+512], ymm0        ; write the reduced dwords 128-191 back over the input
+        vmovdqu	YMMWORD PTR [rcx+544], ymm1
+        vmovdqu	YMMWORD PTR [rcx+576], ymm2
+        vmovdqu	YMMWORD PTR [rcx+608], ymm3
+        vmovdqu	YMMWORD PTR [rcx+640], ymm4
+        vmovdqu	YMMWORD PTR [rcx+672], ymm5
+        vmovdqu	YMMWORD PTR [rcx+704], ymm6
+        vmovdqu	YMMWORD PTR [rcx+736], ymm7
+        vmovdqu	ymm0, YMMWORD PTR [rcx+768]        ; block 4 of 4: load bytes 768-1023 (dwords 192-255)
+        vmovdqu	ymm1, YMMWORD PTR [rcx+800]
+        vmovdqu	ymm2, YMMWORD PTR [rcx+832]
+        vmovdqu	ymm3, YMMWORD PTR [rcx+864]
+        vmovdqu	ymm4, YMMWORD PTR [rcx+896]
+        vmovdqu	ymm5, YMMWORD PTR [rcx+928]
+        vmovdqu	ymm6, YMMWORD PTR [rcx+960]
+        vmovdqu	ymm7, YMMWORD PTR [rcx+992]
+        vpaddd	ymm8, ymm0, ymm11        ; reduce ymm0/ymm1: x -= ((x + v) >> 23) * q
+        vpaddd	ymm9, ymm1, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm0, ymm0, ymm8
+        vpsubd	ymm1, ymm1, ymm9
+        vpaddd	ymm8, ymm2, ymm11        ; reduce ymm2/ymm3
+        vpaddd	ymm9, ymm3, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm2, ymm2, ymm8
+        vpsubd	ymm3, ymm3, ymm9
+        vpaddd	ymm8, ymm4, ymm11        ; reduce ymm4/ymm5
+        vpaddd	ymm9, ymm5, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm4, ymm4, ymm8
+        vpsubd	ymm5, ymm5, ymm9
+        vpaddd	ymm8, ymm6, ymm11        ; reduce ymm6/ymm7
+        vpaddd	ymm9, ymm7, ymm11
+        vpsrad	ymm8, ymm8, 23
+        vpsrad	ymm9, ymm9, 23
+        vpmulld	ymm8, ymm8, ymm10
+        vpmulld	ymm9, ymm9, ymm10
+        vpsubd	ymm6, ymm6, ymm8
+        vpsubd	ymm7, ymm7, ymm9
+        vmovdqu	YMMWORD PTR [rcx+768], ymm0        ; write the reduced dwords 192-255 back over the input
+        vmovdqu	YMMWORD PTR [rcx+800], ymm1
+        vmovdqu	YMMWORD PTR [rcx+832], ymm2
+        vmovdqu	YMMWORD PTR [rcx+864], ymm3
+        vmovdqu	YMMWORD PTR [rcx+896], ymm4
+        vmovdqu	YMMWORD PTR [rcx+928], ymm5
+        vmovdqu	YMMWORD PTR [rcx+960], ymm6
+        vmovdqu	YMMWORD PTR [rcx+992], ymm7
+        vzeroupper        ; zero the upper 128 bits of every ymm register before returning
+        vmovdqu	xmm6, OWORD PTR [rsp]        ; reload the xmm6-xmm11 values saved on entry
+        vmovdqu	xmm7, OWORD PTR [rsp+16]
+        vmovdqu	xmm8, OWORD PTR [rsp+32]
+        vmovdqu	xmm9, OWORD PTR [rsp+48]
+        vmovdqu	xmm10, OWORD PTR [rsp+64]
+        vmovdqu	xmm11, OWORD PTR [rsp+80]
+        add	rsp, 96        ; release the 96-byte save area
+        ret        ; no general-purpose register other than rsp is written; the result is the updated buffer
+wc_mldsa_poly_red_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA +wc_mldsa_ntt_avx2 PROC + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vpxor ymm14, ymm14, ymm14 + vmovdqu ymm14, YMMWORD PTR mldsa_q + ; ntt + mov rdx, QWORD PTR [ptr_L_mldsa_avx2_zetas] + vmovdqu ymm11, YMMWORD PTR [rdx+64] + vmovdqu ymm13, YMMWORD PTR [rdx+96] + ; 128: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx] + vmovdqu ymm12, YMMWORD PTR [rdx+32] + vmovdqu ymm0, YMMWORD PTR [rcx+96] + vmovdqu ymm1, YMMWORD PTR [rcx+224] + vmovdqu ymm2, YMMWORD PTR [rcx+352] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vmovdqu
ymm4, YMMWORD PTR [rcx+608] + vmovdqu ymm5, YMMWORD PTR [rcx+736] + vmovdqu ymm6, YMMWORD PTR [rcx+864] + vmovdqu ymm7, YMMWORD PTR [rcx+992] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 64: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+128] + vmovdqu ymm12, YMMWORD PTR [rdx+160] + vpmulld ymm8, ymm2, ymm13 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + 
vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [rcx+224], ymm1 + vmovdqu YMMWORD PTR [rcx+352], ymm2 + vmovdqu YMMWORD PTR [rcx+480], ymm3 + vmovdqu YMMWORD PTR [rcx+608], ymm4 + vmovdqu YMMWORD PTR [rcx+736], ymm5 + vmovdqu YMMWORD PTR [rcx+864], ymm6 + vmovdqu YMMWORD PTR [rcx+992], ymm7 + ; 128: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx] + vmovdqu ymm12, YMMWORD PTR [rdx+32] + vmovdqu ymm0, YMMWORD PTR [rcx+64] + vmovdqu ymm1, YMMWORD PTR [rcx+192] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+448] + vmovdqu ymm4, YMMWORD PTR [rcx+576] + vmovdqu ymm5, YMMWORD PTR [rcx+704] + vmovdqu ymm6, YMMWORD PTR [rcx+832] + vmovdqu ymm7, YMMWORD PTR [rcx+960] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + 
vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 64: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+128] + vmovdqu ymm12, YMMWORD PTR [rdx+160] + vpmulld ymm8, ymm2, ymm13 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup 
ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [rcx+192], ymm1 + vmovdqu YMMWORD PTR [rcx+320], ymm2 + vmovdqu YMMWORD PTR [rcx+448], ymm3 + vmovdqu YMMWORD PTR [rcx+576], ymm4 + vmovdqu YMMWORD PTR [rcx+704], ymm5 + vmovdqu YMMWORD PTR [rcx+832], ymm6 + vmovdqu YMMWORD PTR [rcx+960], ymm7 + ; 128: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx] + vmovdqu ymm12, YMMWORD PTR [rdx+32] + vmovdqu ymm0, YMMWORD PTR [rcx+32] + vmovdqu ymm1, YMMWORD PTR [rcx+160] + vmovdqu ymm2, YMMWORD PTR [rcx+288] + vmovdqu ymm3, YMMWORD PTR [rcx+416] + vmovdqu ymm4, YMMWORD PTR [rcx+544] + vmovdqu ymm5, YMMWORD PTR [rcx+672] + vmovdqu ymm6, YMMWORD PTR [rcx+800] + vmovdqu ymm7, YMMWORD PTR [rcx+928] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, 
ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 64: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+128] + vmovdqu ymm12, YMMWORD PTR [rdx+160] + vpmulld ymm8, ymm2, ymm13 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + 
vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+160], ymm1 + vmovdqu YMMWORD PTR [rcx+288], ymm2 + vmovdqu YMMWORD PTR [rcx+416], ymm3 + vmovdqu YMMWORD PTR [rcx+544], ymm4 + vmovdqu YMMWORD PTR [rcx+672], ymm5 + vmovdqu YMMWORD PTR [rcx+800], ymm6 + vmovdqu YMMWORD PTR [rcx+928], ymm7 + ; 128: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx] + vmovdqu ymm12, YMMWORD PTR [rdx+32] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+128] + vmovdqu ymm2, YMMWORD PTR [rcx+256] + vmovdqu ymm3, YMMWORD PTR [rcx+384] + vmovdqu ymm4, YMMWORD PTR [rcx+512] + vmovdqu ymm5, YMMWORD PTR [rcx+640] + vmovdqu ymm6, YMMWORD PTR [rcx+768] + vmovdqu ymm7, YMMWORD PTR [rcx+896] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd 
ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 64: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+128] + vmovdqu ymm12, YMMWORD PTR [rdx+160] + vpmulld ymm8, ymm2, ymm13 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + vmovdqu YMMWORD PTR [rcx+256], ymm2 + vmovdqu YMMWORD PTR [rcx+384], ymm3 + vmovdqu YMMWORD PTR [rcx+512], ymm4 + vmovdqu YMMWORD 
PTR [rcx+640], ymm5 + vmovdqu YMMWORD PTR [rcx+768], ymm6 + vmovdqu YMMWORD PTR [rcx+896], ymm7 + vmovdqu ymm4, ymm1 + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vmovdqu ymm6, YMMWORD PTR [rcx+192] + vmovdqu ymm7, YMMWORD PTR [rcx+224] + ; 32: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+192] + vmovdqu ymm12, YMMWORD PTR [rdx+224] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 16: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+256] + vmovdqu ymm12, YMMWORD PTR [rdx+288] + vmovdqu ymm11, YMMWORD PTR [rdx+320] + vmovdqu ymm13, YMMWORD PTR 
[rdx+352] + vpmulld ymm8, ymm2, ymm12 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm12 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm13 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + ; 8: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+384] + vmovdqu ymm12, YMMWORD PTR [rdx+416] + vmovdqu ymm11, YMMWORD PTR [rdx+448] + vmovdqu ymm13, YMMWORD PTR [rdx+480] + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq 
ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vmovdqu ymm10, YMMWORD PTR [rdx+512] + vmovdqu ymm12, YMMWORD PTR [rdx+544] + vmovdqu ymm11, YMMWORD PTR [rdx+576] + vmovdqu ymm13, YMMWORD PTR [rdx+608] + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 4: 1/4 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+640] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+672] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+704] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+736] + vpmulld ymm0, ymm1, ymm12 + vmovshdup ymm2, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm1, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm1, ymm8, ymm0 + vpaddd ymm8, ymm8, ymm0 + vpmulld ymm0, ymm3, ymm13 + vmovshdup ymm2, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + 
vpsubq ymm0, ymm3, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm3, ymm9, ymm0 + vpaddd ymm9, ymm9, ymm0 + ; 2: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+768] + vmovdqu ymm12, YMMWORD PTR [rdx+800] + vmovdqu ymm11, YMMWORD PTR [rdx+832] + vmovdqu ymm13, YMMWORD PTR [rdx+864] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 4: 1/4 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+896] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+928] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+960] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+992] + vpmulld ymm4, ymm5, ymm12 + vmovshdup ymm6, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm5, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm5, ymm8, ymm4 + vpaddd ymm8, ymm8, ymm4 + vpmulld ymm4, ymm7, ymm13 + vmovshdup ymm6, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 
+ vpsubq ymm4, ymm7, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm7, ymm9, ymm4 + vpaddd ymm9, ymm9, ymm4 + ; 2: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1024] + vmovdqu ymm12, YMMWORD PTR [rdx+1056] + vmovdqu ymm11, YMMWORD PTR [rdx+1088] + vmovdqu ymm13, YMMWORD PTR [rdx+1120] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 1: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1152] + vmovdqu ymm12, YMMWORD PTR [rdx+1184] + vmovdqu ymm11, YMMWORD PTR [rdx+1216] + vmovdqu ymm13, YMMWORD PTR [rdx+1248] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq 
ymm3, ymm3, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 1: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1280] + vmovdqu ymm12, YMMWORD PTR [rdx+1312] + vmovdqu ymm11, YMMWORD PTR [rdx+1344] + vmovdqu ymm13, YMMWORD PTR [rdx+1376] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + vmovdqu YMMWORD PTR [rcx+160], ymm5 + vmovdqu YMMWORD PTR [rcx+192], ymm6 + vmovdqu YMMWORD PTR [rcx+224], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vmovdqu ymm4, YMMWORD PTR [rcx+384] + vmovdqu ymm5, YMMWORD PTR [rcx+416] + 
vmovdqu ymm6, YMMWORD PTR [rcx+448] + vmovdqu ymm7, YMMWORD PTR [rcx+480] + ; 32: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1408] + vmovdqu ymm12, YMMWORD PTR [rdx+1440] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 16: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1472] + vmovdqu ymm12, YMMWORD PTR [rdx+1504] + vmovdqu ymm11, YMMWORD PTR [rdx+1536] + vmovdqu ymm13, YMMWORD PTR [rdx+1568] + vpmulld ymm8, ymm2, ymm12 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + 
vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm12 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm13 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + ; 8: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1600] + vmovdqu ymm12, YMMWORD PTR [rdx+1632] + vmovdqu ymm11, YMMWORD PTR [rdx+1664] + vmovdqu ymm13, YMMWORD PTR [rdx+1696] + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd 
ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vmovdqu ymm10, YMMWORD PTR [rdx+1728] + vmovdqu ymm12, YMMWORD PTR [rdx+1760] + vmovdqu ymm11, YMMWORD PTR [rdx+1792] + vmovdqu ymm13, YMMWORD PTR [rdx+1824] + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 4: 2/4 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+1856] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+1888] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+1920] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+1952] + vpmulld ymm0, ymm1, ymm12 + vmovshdup ymm2, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm1, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm1, ymm8, ymm0 + vpaddd ymm8, ymm8, ymm0 + vpmulld ymm0, ymm3, ymm13 + vmovshdup ymm2, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm3, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm3, ymm9, ymm0 + vpaddd ymm9, ymm9, ymm0 + ; 2: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1984] + vmovdqu ymm12, YMMWORD PTR 
[rdx+2016] + vmovdqu ymm11, YMMWORD PTR [rdx+2048] + vmovdqu ymm13, YMMWORD PTR [rdx+2080] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 4: 2/4 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+2112] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+2144] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+2176] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+2208] + vpmulld ymm4, ymm5, ymm12 + vmovshdup ymm6, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm5, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm5, ymm8, ymm4 + vpaddd ymm8, ymm8, ymm4 + vpmulld ymm4, ymm7, ymm13 + vmovshdup ymm6, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm7, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm7, ymm9, ymm4 + vpaddd ymm9, ymm9, ymm4 + ; 2: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2240] + vmovdqu ymm12, 
YMMWORD PTR [rdx+2272] + vmovdqu ymm11, YMMWORD PTR [rdx+2304] + vmovdqu ymm13, YMMWORD PTR [rdx+2336] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 1: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2368] + vmovdqu ymm12, YMMWORD PTR [rdx+2400] + vmovdqu ymm11, YMMWORD PTR [rdx+2432] + vmovdqu ymm13, YMMWORD PTR [rdx+2464] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + 
vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 1: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2496] + vmovdqu ymm12, YMMWORD PTR [rdx+2528] + vmovdqu ymm11, YMMWORD PTR [rdx+2560] + vmovdqu ymm13, YMMWORD PTR [rdx+2592] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + vmovdqu YMMWORD PTR [rcx+256], ymm0 + vmovdqu YMMWORD PTR [rcx+288], ymm1 + vmovdqu YMMWORD PTR [rcx+320], ymm2 + vmovdqu YMMWORD PTR [rcx+352], ymm3 + vmovdqu YMMWORD PTR [rcx+384], ymm4 + vmovdqu YMMWORD PTR [rcx+416], ymm5 + vmovdqu YMMWORD PTR [rcx+448], ymm6 + vmovdqu YMMWORD PTR [rcx+480], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+512] + vmovdqu ymm1, YMMWORD PTR [rcx+544] + vmovdqu ymm2, YMMWORD PTR [rcx+576] + vmovdqu ymm3, YMMWORD PTR [rcx+608] + vmovdqu ymm4, YMMWORD PTR [rcx+640] + vmovdqu ymm5, YMMWORD PTR [rcx+672] + vmovdqu ymm6, YMMWORD PTR [rcx+704] + vmovdqu ymm7, YMMWORD PTR [rcx+736] + ; 32: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2624] + vmovdqu ymm12, YMMWORD PTR [rdx+2656] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq 
ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 16: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2688] + vmovdqu ymm12, YMMWORD PTR [rdx+2720] + vmovdqu ymm11, YMMWORD PTR [rdx+2752] + vmovdqu ymm13, YMMWORD PTR [rdx+2784] + vpmulld ymm8, ymm2, ymm12 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm12 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, 
ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm13 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + ; 8: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2816] + vmovdqu ymm12, YMMWORD PTR [rdx+2848] + vmovdqu ymm11, YMMWORD PTR [rdx+2880] + vmovdqu ymm13, YMMWORD PTR [rdx+2912] + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vmovdqu ymm10, YMMWORD PTR [rdx+2944] + vmovdqu ymm12, YMMWORD PTR [rdx+2976] + vmovdqu ymm11, YMMWORD PTR [rdx+3008] + vmovdqu ymm13, YMMWORD PTR [rdx+3040] + vpmulld ymm8, ymm5, ymm12 + 
vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 4: 3/4 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+3072] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+3104] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+3136] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+3168] + vpmulld ymm0, ymm1, ymm12 + vmovshdup ymm2, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm1, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm1, ymm8, ymm0 + vpaddd ymm8, ymm8, ymm0 + vpmulld ymm0, ymm3, ymm13 + vmovshdup ymm2, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm3, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm3, ymm9, ymm0 + vpaddd ymm9, ymm9, ymm0 + ; 2: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3200] + vmovdqu ymm12, YMMWORD PTR [rdx+3232] + vmovdqu ymm11, YMMWORD PTR [rdx+3264] + vmovdqu ymm13, YMMWORD PTR [rdx+3296] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmulld ymm8, 
ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 4: 3/4 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+3328] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+3360] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+3392] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+3424] + vpmulld ymm4, ymm5, ymm12 + vmovshdup ymm6, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm5, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm5, ymm8, ymm4 + vpaddd ymm8, ymm8, ymm4 + vpmulld ymm4, ymm7, ymm13 + vmovshdup ymm6, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm7, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm7, ymm9, ymm4 + vpaddd ymm9, ymm9, ymm4 + ; 2: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3456] + vmovdqu ymm12, YMMWORD PTR [rdx+3488] + vmovdqu ymm11, YMMWORD PTR [rdx+3520] + vmovdqu ymm13, YMMWORD PTR [rdx+3552] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + 
vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 1: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3584] + vmovdqu ymm12, YMMWORD PTR [rdx+3616] + vmovdqu ymm11, YMMWORD PTR [rdx+3648] + vmovdqu ymm13, YMMWORD PTR [rdx+3680] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 1: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3712] + vmovdqu ymm12, YMMWORD PTR [rdx+3744] + vmovdqu ymm11, YMMWORD PTR [rdx+3776] + vmovdqu 
ymm13, YMMWORD PTR [rdx+3808] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + vmovdqu YMMWORD PTR [rcx+512], ymm0 + vmovdqu YMMWORD PTR [rcx+544], ymm1 + vmovdqu YMMWORD PTR [rcx+576], ymm2 + vmovdqu YMMWORD PTR [rcx+608], ymm3 + vmovdqu YMMWORD PTR [rcx+640], ymm4 + vmovdqu YMMWORD PTR [rcx+672], ymm5 + vmovdqu YMMWORD PTR [rcx+704], ymm6 + vmovdqu YMMWORD PTR [rcx+736], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+768] + vmovdqu ymm1, YMMWORD PTR [rcx+800] + vmovdqu ymm2, YMMWORD PTR [rcx+832] + vmovdqu ymm3, YMMWORD PTR [rcx+864] + vmovdqu ymm4, YMMWORD PTR [rcx+896] + vmovdqu ymm5, YMMWORD PTR [rcx+928] + vmovdqu ymm6, YMMWORD PTR [rcx+960] + vmovdqu ymm7, YMMWORD PTR [rcx+992] + ; 32: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3840] + vmovdqu ymm12, YMMWORD PTR [rdx+3872] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, 
ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 16: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3904] + vmovdqu ymm12, YMMWORD PTR [rdx+3936] + vmovdqu ymm11, YMMWORD PTR [rdx+3968] + vmovdqu ymm13, YMMWORD PTR [rdx+4000] + vpmulld ymm8, ymm2, ymm12 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm12 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + 
vpmulld ymm8, ymm6, ymm13 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + ; 8: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4032] + vmovdqu ymm12, YMMWORD PTR [rdx+4064] + vmovdqu ymm11, YMMWORD PTR [rdx+4096] + vmovdqu ymm13, YMMWORD PTR [rdx+4128] + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vmovdqu ymm10, YMMWORD PTR [rdx+4160] + vmovdqu ymm12, YMMWORD PTR [rdx+4192] + vmovdqu ymm11, YMMWORD PTR [rdx+4224] + vmovdqu ymm13, YMMWORD PTR [rdx+4256] + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, 
ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 4: 4/4 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+4288] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+4320] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+4352] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+4384] + vpmulld ymm0, ymm1, ymm12 + vmovshdup ymm2, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm1, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm1, ymm8, ymm0 + vpaddd ymm8, ymm8, ymm0 + vpmulld ymm0, ymm3, ymm13 + vmovshdup ymm2, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm3, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm3, ymm9, ymm0 + vpaddd ymm9, ymm9, ymm0 + ; 2: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4416] + vmovdqu ymm12, YMMWORD PTR [rdx+4448] + vmovdqu ymm11, YMMWORD PTR [rdx+4480] + vmovdqu ymm13, YMMWORD PTR [rdx+4512] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + 
vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 4: 4/4 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+4544] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+4576] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+4608] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+4640] + vpmulld ymm4, ymm5, ymm12 + vmovshdup ymm6, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm5, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm5, ymm8, ymm4 + vpaddd ymm8, ymm8, ymm4 + vpmulld ymm4, ymm7, ymm13 + vmovshdup ymm6, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm7, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm7, ymm9, ymm4 + vpaddd ymm9, ymm9, ymm4 + ; 2: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4672] + vmovdqu ymm12, YMMWORD PTR [rdx+4704] + vmovdqu ymm11, YMMWORD PTR [rdx+4736] + vmovdqu ymm13, YMMWORD PTR [rdx+4768] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, 
ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 1: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4800] + vmovdqu ymm12, YMMWORD PTR [rdx+4832] + vmovdqu ymm11, YMMWORD PTR [rdx+4864] + vmovdqu ymm13, YMMWORD PTR [rdx+4896] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 1: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4928] + vmovdqu ymm12, YMMWORD PTR [rdx+4960] + vmovdqu ymm11, YMMWORD PTR [rdx+4992] + vmovdqu ymm13, YMMWORD PTR [rdx+5024] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd 
ymm7, ymm7, ymm9, 85 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + vmovdqu YMMWORD PTR [rcx+768], ymm0 + vmovdqu YMMWORD PTR [rcx+800], ymm1 + vmovdqu YMMWORD PTR [rcx+832], ymm2 + vmovdqu YMMWORD PTR [rcx+864], ymm3 + vmovdqu YMMWORD PTR [rcx+896], ymm4 + vmovdqu YMMWORD PTR [rcx+928], ymm5 + vmovdqu YMMWORD PTR [rcx+960], ymm6 + vmovdqu YMMWORD PTR [rcx+992], ymm7 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +wc_mldsa_ntt_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_ntt_full_avx2 PROC + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vpxor ymm14, ymm14, ymm14 + vmovdqu ymm14, YMMWORD PTR mldsa_q 
+ ; ntt + mov rdx, QWORD PTR [ptr_L_mldsa_avx2_zetas] + vmovdqu ymm11, YMMWORD PTR [rdx+64] + vmovdqu ymm13, YMMWORD PTR [rdx+96] + ; 128: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx] + vmovdqu ymm12, YMMWORD PTR [rdx+32] + vmovdqu ymm0, YMMWORD PTR [rcx+96] + vmovdqu ymm1, YMMWORD PTR [rcx+224] + vmovdqu ymm2, YMMWORD PTR [rcx+352] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vmovdqu ymm4, YMMWORD PTR [rcx+608] + vmovdqu ymm5, YMMWORD PTR [rcx+736] + vmovdqu ymm6, YMMWORD PTR [rcx+864] + vmovdqu ymm7, YMMWORD PTR [rcx+992] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 64: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+128] + vmovdqu ymm12, 
YMMWORD PTR [rdx+160] + vpmulld ymm8, ymm2, ymm13 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [rcx+224], ymm1 + vmovdqu YMMWORD PTR [rcx+352], ymm2 + vmovdqu YMMWORD PTR [rcx+480], ymm3 + vmovdqu YMMWORD PTR [rcx+608], ymm4 + vmovdqu YMMWORD PTR [rcx+736], ymm5 + vmovdqu YMMWORD PTR [rcx+864], ymm6 + vmovdqu YMMWORD PTR [rcx+992], ymm7 + ; 128: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx] + vmovdqu ymm12, YMMWORD PTR [rdx+32] + vmovdqu ymm0, YMMWORD PTR [rcx+64] + vmovdqu ymm1, YMMWORD PTR [rcx+192] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+448] + vmovdqu ymm4, YMMWORD PTR 
[rcx+576] + vmovdqu ymm5, YMMWORD PTR [rcx+704] + vmovdqu ymm6, YMMWORD PTR [rcx+832] + vmovdqu ymm7, YMMWORD PTR [rcx+960] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 64: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+128] + vmovdqu ymm12, YMMWORD PTR [rdx+160] + vpmulld ymm8, ymm2, ymm13 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, 
ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [rcx+192], ymm1 + vmovdqu YMMWORD PTR [rcx+320], ymm2 + vmovdqu YMMWORD PTR [rcx+448], ymm3 + vmovdqu YMMWORD PTR [rcx+576], ymm4 + vmovdqu YMMWORD PTR [rcx+704], ymm5 + vmovdqu YMMWORD PTR [rcx+832], ymm6 + vmovdqu YMMWORD PTR [rcx+960], ymm7 + ; 128: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx] + vmovdqu ymm12, YMMWORD PTR [rdx+32] + vmovdqu ymm0, YMMWORD PTR [rcx+32] + vmovdqu ymm1, YMMWORD PTR [rcx+160] + vmovdqu ymm2, YMMWORD PTR [rcx+288] + vmovdqu ymm3, YMMWORD PTR [rcx+416] + vmovdqu ymm4, YMMWORD PTR [rcx+544] + vmovdqu ymm5, YMMWORD PTR [rcx+672] + vmovdqu ymm6, YMMWORD PTR [rcx+800] + vmovdqu ymm7, YMMWORD PTR [rcx+928] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + 
vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 64: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+128] + vmovdqu ymm12, YMMWORD PTR [rdx+160] + vpmulld ymm8, ymm2, ymm13 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq 
ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+160], ymm1 + vmovdqu YMMWORD PTR [rcx+288], ymm2 + vmovdqu YMMWORD PTR [rcx+416], ymm3 + vmovdqu YMMWORD PTR [rcx+544], ymm4 + vmovdqu YMMWORD PTR [rcx+672], ymm5 + vmovdqu YMMWORD PTR [rcx+800], ymm6 + vmovdqu YMMWORD PTR [rcx+928], ymm7 + ; 128: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx] + vmovdqu ymm12, YMMWORD PTR [rdx+32] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+128] + vmovdqu ymm2, YMMWORD PTR [rcx+256] + vmovdqu ymm3, YMMWORD PTR [rcx+384] + vmovdqu ymm4, YMMWORD PTR [rcx+512] + vmovdqu ymm5, YMMWORD PTR [rcx+640] + vmovdqu ymm6, YMMWORD PTR [rcx+768] + vmovdqu ymm7, YMMWORD PTR [rcx+896] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, 
ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 64: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+128] + vmovdqu ymm12, YMMWORD PTR [rdx+160] + vpmulld ymm8, ymm2, ymm13 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + 
vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + vmovdqu YMMWORD PTR [rcx+256], ymm2 + vmovdqu YMMWORD PTR [rcx+384], ymm3 + vmovdqu YMMWORD PTR [rcx+512], ymm4 + vmovdqu YMMWORD PTR [rcx+640], ymm5 + vmovdqu YMMWORD PTR [rcx+768], ymm6 + vmovdqu YMMWORD PTR [rcx+896], ymm7 + vmovdqu ymm4, ymm1 + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vmovdqu ymm6, YMMWORD PTR [rcx+192] + vmovdqu ymm7, YMMWORD PTR [rcx+224] + ; 32: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+192] + vmovdqu ymm12, YMMWORD PTR [rdx+224] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + 
vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 16: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+256] + vmovdqu ymm12, YMMWORD PTR [rdx+288] + vmovdqu ymm11, YMMWORD PTR [rdx+320] + vmovdqu ymm13, YMMWORD PTR [rdx+352] + vpmulld ymm8, ymm2, ymm12 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm12 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm13 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + ; 8: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+384] + vmovdqu ymm12, YMMWORD PTR [rdx+416] + vmovdqu ymm11, YMMWORD PTR [rdx+448] + vmovdqu ymm13, YMMWORD PTR [rdx+480] + vpmulld ymm8, ymm1, ymm12 + 
vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vmovdqu ymm10, YMMWORD PTR [rdx+512] + vmovdqu ymm12, YMMWORD PTR [rdx+544] + vmovdqu ymm11, YMMWORD PTR [rdx+576] + vmovdqu ymm13, YMMWORD PTR [rdx+608] + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 4: 1/4 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+640] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+672] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+704] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+736] + vpmulld ymm0, ymm1, ymm12 + vmovshdup ymm2, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + 
vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm1, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm1, ymm8, ymm0 + vpaddd ymm8, ymm8, ymm0 + vpmulld ymm0, ymm3, ymm13 + vmovshdup ymm2, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm3, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm3, ymm9, ymm0 + vpaddd ymm9, ymm9, ymm0 + ; 2: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+768] + vmovdqu ymm12, YMMWORD PTR [rdx+800] + vmovdqu ymm11, YMMWORD PTR [rdx+832] + vmovdqu ymm13, YMMWORD PTR [rdx+864] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 4: 1/4 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+896] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+928] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+960] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+992] + vpmulld ymm4, ymm5, ymm12 + vmovshdup ymm6, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 
+ vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm5, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm5, ymm8, ymm4 + vpaddd ymm8, ymm8, ymm4 + vpmulld ymm4, ymm7, ymm13 + vmovshdup ymm6, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm7, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm7, ymm9, ymm4 + vpaddd ymm9, ymm9, ymm4 + ; 2: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1024] + vmovdqu ymm12, YMMWORD PTR [rdx+1056] + vmovdqu ymm11, YMMWORD PTR [rdx+1088] + vmovdqu ymm13, YMMWORD PTR [rdx+1120] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 1: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1152] + vmovdqu ymm12, YMMWORD PTR [rdx+1184] + vmovdqu ymm11, YMMWORD PTR [rdx+1216] + vmovdqu ymm13, YMMWORD PTR [rdx+1248] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + 
vpmuldq ymm1, ymm1, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 1: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1280] + vmovdqu ymm12, YMMWORD PTR [rdx+1312] + vmovdqu ymm11, YMMWORD PTR [rdx+1344] + vmovdqu ymm13, YMMWORD PTR [rdx+1376] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm8, ymm9, 49 + vpunpckldq ymm8, 
ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vperm2i128 ymm2, ymm8, ymm9, 32 + vperm2i128 ymm3, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vperm2i128 ymm4, ymm8, ymm9, 32 + vperm2i128 ymm5, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vperm2i128 ymm6, ymm8, ymm9, 32 + vperm2i128 ymm7, ymm8, ymm9, 49 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + vmovdqu YMMWORD PTR [rcx+160], ymm5 + vmovdqu YMMWORD PTR [rcx+192], ymm6 + vmovdqu YMMWORD PTR [rcx+224], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vmovdqu ymm4, YMMWORD PTR [rcx+384] + vmovdqu ymm5, YMMWORD PTR [rcx+416] + vmovdqu ymm6, YMMWORD PTR [rcx+448] + vmovdqu ymm7, YMMWORD PTR [rcx+480] + ; 32: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1408] + vmovdqu ymm12, YMMWORD PTR [rdx+1440] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, 
ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 16: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1472] + vmovdqu ymm12, YMMWORD PTR [rdx+1504] + vmovdqu ymm11, YMMWORD PTR [rdx+1536] + vmovdqu ymm13, YMMWORD PTR [rdx+1568] + vpmulld ymm8, ymm2, ymm12 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm12 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm13 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + 
vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + ; 8: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1600] + vmovdqu ymm12, YMMWORD PTR [rdx+1632] + vmovdqu ymm11, YMMWORD PTR [rdx+1664] + vmovdqu ymm13, YMMWORD PTR [rdx+1696] + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vmovdqu ymm10, YMMWORD PTR [rdx+1728] + vmovdqu ymm12, YMMWORD PTR [rdx+1760] + vmovdqu ymm11, YMMWORD PTR [rdx+1792] + vmovdqu ymm13, YMMWORD PTR [rdx+1824] + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 4: 2/4 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+1856] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+1888] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, 
YMMWORD PTR [rdx+1920] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+1952] + vpmulld ymm0, ymm1, ymm12 + vmovshdup ymm2, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm1, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm1, ymm8, ymm0 + vpaddd ymm8, ymm8, ymm0 + vpmulld ymm0, ymm3, ymm13 + vmovshdup ymm2, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm3, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm3, ymm9, ymm0 + vpaddd ymm9, ymm9, ymm0 + ; 2: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1984] + vmovdqu ymm12, YMMWORD PTR [rdx+2016] + vmovdqu ymm11, YMMWORD PTR [rdx+2048] + vmovdqu ymm13, YMMWORD PTR [rdx+2080] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 4: 2/4 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+2112] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+2144] + vperm2i128 ymm9, ymm6, ymm7, 32 + 
vmovdqu ymm11, YMMWORD PTR [rdx+2176] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+2208] + vpmulld ymm4, ymm5, ymm12 + vmovshdup ymm6, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm5, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm5, ymm8, ymm4 + vpaddd ymm8, ymm8, ymm4 + vpmulld ymm4, ymm7, ymm13 + vmovshdup ymm6, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm7, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm7, ymm9, ymm4 + vpaddd ymm9, ymm9, ymm4 + ; 2: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2240] + vmovdqu ymm12, YMMWORD PTR [rdx+2272] + vmovdqu ymm11, YMMWORD PTR [rdx+2304] + vmovdqu ymm13, YMMWORD PTR [rdx+2336] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 1: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2368] + vmovdqu ymm12, YMMWORD PTR [rdx+2400] + vmovdqu ymm11, YMMWORD PTR [rdx+2432] + vmovdqu ymm13, YMMWORD PTR [rdx+2464] + vpsllq 
ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 1: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2496] + vmovdqu ymm12, YMMWORD PTR [rdx+2528] + vmovdqu ymm11, YMMWORD PTR [rdx+2560] + vmovdqu ymm13, YMMWORD PTR [rdx+2592] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq 
ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vperm2i128 ymm2, ymm8, ymm9, 32 + vperm2i128 ymm3, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vperm2i128 ymm4, ymm8, ymm9, 32 + vperm2i128 ymm5, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vperm2i128 ymm6, ymm8, ymm9, 32 + vperm2i128 ymm7, ymm8, ymm9, 49 + vmovdqu YMMWORD PTR [rcx+256], ymm0 + vmovdqu YMMWORD PTR [rcx+288], ymm1 + vmovdqu YMMWORD PTR [rcx+320], ymm2 + vmovdqu YMMWORD PTR [rcx+352], ymm3 + vmovdqu YMMWORD PTR [rcx+384], ymm4 + vmovdqu YMMWORD PTR [rcx+416], ymm5 + vmovdqu YMMWORD PTR [rcx+448], ymm6 + vmovdqu YMMWORD PTR [rcx+480], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+512] + vmovdqu ymm1, YMMWORD PTR [rcx+544] + vmovdqu ymm2, YMMWORD PTR [rcx+576] + vmovdqu ymm3, YMMWORD PTR [rcx+608] + vmovdqu ymm4, YMMWORD PTR [rcx+640] + vmovdqu ymm5, YMMWORD PTR [rcx+672] + vmovdqu ymm6, YMMWORD PTR [rcx+704] + vmovdqu ymm7, YMMWORD PTR [rcx+736] + ; 32: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2624] + vmovdqu ymm12, YMMWORD PTR [rdx+2656] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + 
vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 16: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2688] + vmovdqu ymm12, YMMWORD PTR [rdx+2720] + vmovdqu ymm11, YMMWORD PTR [rdx+2752] + vmovdqu ymm13, YMMWORD PTR [rdx+2784] + vpmulld ymm8, ymm2, ymm12 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm12 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm13 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, 
ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + ; 8: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2816] + vmovdqu ymm12, YMMWORD PTR [rdx+2848] + vmovdqu ymm11, YMMWORD PTR [rdx+2880] + vmovdqu ymm13, YMMWORD PTR [rdx+2912] + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vmovdqu ymm10, YMMWORD PTR [rdx+2944] + vmovdqu ymm12, YMMWORD PTR [rdx+2976] + vmovdqu ymm11, YMMWORD PTR [rdx+3008] + vmovdqu ymm13, YMMWORD PTR [rdx+3040] + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd 
ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 4: 3/4 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+3072] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+3104] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+3136] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+3168] + vpmulld ymm0, ymm1, ymm12 + vmovshdup ymm2, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm1, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm1, ymm8, ymm0 + vpaddd ymm8, ymm8, ymm0 + vpmulld ymm0, ymm3, ymm13 + vmovshdup ymm2, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm3, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm3, ymm9, ymm0 + vpaddd ymm9, ymm9, ymm0 + ; 2: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3200] + vmovdqu ymm12, YMMWORD PTR [rdx+3232] + vmovdqu ymm11, YMMWORD PTR [rdx+3264] + vmovdqu ymm13, YMMWORD PTR [rdx+3296] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 
+ vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 4: 3/4 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+3328] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+3360] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+3392] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+3424] + vpmulld ymm4, ymm5, ymm12 + vmovshdup ymm6, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm5, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm5, ymm8, ymm4 + vpaddd ymm8, ymm8, ymm4 + vpmulld ymm4, ymm7, ymm13 + vmovshdup ymm6, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm7, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm7, ymm9, ymm4 + vpaddd ymm9, ymm9, ymm4 + ; 2: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3456] + vmovdqu ymm12, YMMWORD PTR [rdx+3488] + vmovdqu ymm11, YMMWORD PTR [rdx+3520] + vmovdqu ymm13, YMMWORD PTR [rdx+3552] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup 
ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 1: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3584] + vmovdqu ymm12, YMMWORD PTR [rdx+3616] + vmovdqu ymm11, YMMWORD PTR [rdx+3648] + vmovdqu ymm13, YMMWORD PTR [rdx+3680] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 1: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3712] + vmovdqu ymm12, YMMWORD PTR [rdx+3744] + vmovdqu ymm11, YMMWORD PTR [rdx+3776] + vmovdqu ymm13, YMMWORD PTR [rdx+3808] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + 
vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vperm2i128 ymm2, ymm8, ymm9, 32 + vperm2i128 ymm3, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vperm2i128 ymm4, ymm8, ymm9, 32 + vperm2i128 ymm5, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vperm2i128 ymm6, ymm8, ymm9, 32 + vperm2i128 ymm7, ymm8, ymm9, 49 + vmovdqu YMMWORD PTR [rcx+512], ymm0 + vmovdqu YMMWORD PTR [rcx+544], ymm1 + vmovdqu YMMWORD PTR [rcx+576], ymm2 + vmovdqu YMMWORD PTR [rcx+608], ymm3 + vmovdqu YMMWORD PTR [rcx+640], ymm4 + vmovdqu YMMWORD PTR [rcx+672], ymm5 + vmovdqu YMMWORD PTR [rcx+704], ymm6 + vmovdqu YMMWORD PTR [rcx+736], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+768] + vmovdqu ymm1, YMMWORD PTR [rcx+800] + vmovdqu ymm2, YMMWORD PTR [rcx+832] + vmovdqu ymm3, YMMWORD PTR [rcx+864] + vmovdqu ymm4, YMMWORD PTR [rcx+896] + vmovdqu ymm5, YMMWORD PTR [rcx+928] + vmovdqu ymm6, YMMWORD PTR [rcx+960] + vmovdqu ymm7, YMMWORD PTR [rcx+992] + ; 32: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3840] + vmovdqu ymm12, YMMWORD PTR [rdx+3872] + vpmulld ymm8, ymm4, ymm12 + vmovshdup ymm9, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm4, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm5, ymm12 + 
vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm12 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vpmulld ymm8, ymm7, ymm12 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm3, ymm8 + vpaddd ymm3, ymm3, ymm8 + ; 16: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3904] + vmovdqu ymm12, YMMWORD PTR [rdx+3936] + vmovdqu ymm11, YMMWORD PTR [rdx+3968] + vmovdqu ymm13, YMMWORD PTR [rdx+4000] + vpmulld ymm8, ymm2, ymm12 + vmovshdup ymm9, ymm2 + vpmuldq ymm2, ymm2, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm2, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm12 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm1, ymm8 + vpaddd ymm1, ymm1, ymm8 + vpmulld ymm8, ymm6, ymm13 + vmovshdup ymm9, ymm6 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm9, 
ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm6, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm5, ymm8 + vpaddd ymm5, ymm5, ymm8 + ; 8: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4032] + vmovdqu ymm12, YMMWORD PTR [rdx+4064] + vmovdqu ymm11, YMMWORD PTR [rdx+4096] + vmovdqu ymm13, YMMWORD PTR [rdx+4128] + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + vmovdqu ymm10, YMMWORD PTR [rdx+4160] + vmovdqu ymm12, YMMWORD PTR [rdx+4192] + vmovdqu ymm11, YMMWORD PTR [rdx+4224] + vmovdqu ymm13, YMMWORD PTR [rdx+4256] + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + 
vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 4: 4/4 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+4288] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+4320] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+4352] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+4384] + vpmulld ymm0, ymm1, ymm12 + vmovshdup ymm2, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm1, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm1, ymm8, ymm0 + vpaddd ymm8, ymm8, ymm0 + vpmulld ymm0, ymm3, ymm13 + vmovshdup ymm2, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm0 + vpmuldq ymm0, ymm0, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm0, ymm3, ymm0 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm0, ymm0 + vpblendd ymm0, ymm0, ymm15, 170 + vpsubd ymm3, ymm9, ymm0 + vpaddd ymm9, ymm9, ymm0 + ; 2: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4416] + vmovdqu ymm12, YMMWORD PTR [rdx+4448] + vmovdqu ymm11, YMMWORD PTR [rdx+4480] + vmovdqu ymm13, YMMWORD PTR [rdx+4512] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, 
ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 4: 4/4 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+4544] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+4576] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+4608] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+4640] + vpmulld ymm4, ymm5, ymm12 + vmovshdup ymm6, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm5, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm5, ymm8, ymm4 + vpaddd ymm8, ymm8, ymm4 + vpmulld ymm4, ymm7, ymm13 + vmovshdup ymm6, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm7, ymm4 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm7, ymm9, ymm4 + vpaddd ymm9, ymm9, ymm4 + ; 2: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4672] + vmovdqu ymm12, YMMWORD PTR [rdx+4704] + vmovdqu ymm11, YMMWORD PTR [rdx+4736] + vmovdqu ymm13, YMMWORD PTR [rdx+4768] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + 
vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + ; 1: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4800] + vmovdqu ymm12, YMMWORD PTR [rdx+4832] + vmovdqu ymm11, YMMWORD PTR [rdx+4864] + vmovdqu ymm13, YMMWORD PTR [rdx+4896] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmulld ymm8, ymm1, ymm12 + vmovshdup ymm9, ymm1 + vpmuldq ymm1, ymm1, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm1, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm1, ymm0, ymm8 + vpaddd ymm0, ymm0, ymm8 + vpmulld ymm8, ymm3, ymm13 + vmovshdup ymm9, ymm3 + vpmuldq ymm3, ymm3, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm3, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm3, ymm2, ymm8 + vpaddd ymm2, ymm2, ymm8 + ; 1: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4928] + vmovdqu ymm12, YMMWORD PTR [rdx+4960] + vmovdqu ymm11, YMMWORD PTR [rdx+4992] + vmovdqu ymm13, YMMWORD PTR [rdx+5024] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmulld ymm8, ymm5, ymm12 + vmovshdup ymm9, ymm5 + vpmuldq ymm5, ymm5, 
ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm5, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm5, ymm4, ymm8 + vpaddd ymm4, ymm4, ymm8 + vpmulld ymm8, ymm7, ymm13 + vmovshdup ymm9, ymm7 + vpmuldq ymm7, ymm7, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm7, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm8, ymm8 + vpblendd ymm8, ymm8, ymm15, 170 + vpsubd ymm7, ymm6, ymm8 + vpaddd ymm6, ymm6, ymm8 + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vperm2i128 ymm2, ymm8, ymm9, 32 + vperm2i128 ymm3, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vperm2i128 ymm4, ymm8, ymm9, 32 + vperm2i128 ymm5, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vperm2i128 ymm6, ymm8, ymm9, 32 + vperm2i128 ymm7, ymm8, ymm9, 49 + vmovdqu YMMWORD PTR [rcx+768], ymm0 + vmovdqu YMMWORD PTR [rcx+800], ymm1 + vmovdqu YMMWORD PTR [rcx+832], ymm2 + vmovdqu YMMWORD PTR [rcx+864], ymm3 + vmovdqu YMMWORD PTR [rcx+896], ymm4 + vmovdqu YMMWORD PTR [rcx+928], ymm5 + vmovdqu YMMWORD PTR [rcx+960], ymm6 + vmovdqu YMMWORD PTR [rcx+992], ymm7 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +wc_mldsa_ntt_full_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_invntt_avx2 PROC + sub rsp, 160 
+ vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vpxor ymm14, ymm14, ymm14 + vmovdqu ymm14, YMMWORD PTR mldsa_q + ; invntt + mov rdx, QWORD PTR [ptr_L_mldsa_avx2_zetas_inv] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vmovdqu ymm4, YMMWORD PTR [rcx+128] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vmovdqu ymm6, YMMWORD PTR [rcx+192] + vmovdqu ymm7, YMMWORD PTR [rcx+224] + ; 1: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx] + vmovdqu ymm12, YMMWORD PTR [rdx+32] + vmovdqu ymm11, YMMWORD PTR [rdx+64] + vmovdqu ymm13, YMMWORD PTR [rdx+96] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 2: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+128] + vmovdqu ymm12, YMMWORD PTR [rdx+160] + vmovdqu ymm11, YMMWORD PTR [rdx+192] + vmovdqu ymm13, YMMWORD PTR [rdx+224] + vpshufd ymm8, ymm0, 216 + vpshufd ymm9, ymm1, 216 + vpunpckldq ymm0, ymm8, ymm9 + vpunpckhdq ymm1, ymm8, ymm9 + vpshufd ymm8, ymm2, 216 + 
vpshufd ymm9, ymm3, 216 + vpunpckldq ymm2, ymm8, ymm9 + vpunpckhdq ymm3, ymm8, ymm9 + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 4: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+256] + vmovdqu ymm12, YMMWORD PTR [rdx+288] + vmovdqu ymm11, YMMWORD PTR [rdx+320] + vmovdqu ymm13, YMMWORD PTR [rdx+352] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm1, ymm0, ymm1 + vpunpcklqdq ymm9, ymm2, ymm3 + vpunpckhqdq ymm3, ymm2, ymm3 + vpsubd ymm0, ymm8, ymm1 + vpaddd ymm8, ymm8, ymm1 + vpmulld ymm1, ymm0, ymm12 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm0, ymm1 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm0, ymm9, ymm3 + vpaddd ymm9, ymm9, ymm3 + vpmulld ymm3, ymm0, ymm13 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm0, ymm3 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 8: 1/4 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+384] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+416] + vperm2i128 ymm2, ymm9, ymm3, 32 + vmovdqu ymm11, YMMWORD 
PTR [rdx+448] + vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+480] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 16: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+512] + vmovdqu ymm12, YMMWORD PTR [rdx+544] + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 1: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+576] + vmovdqu ymm12, YMMWORD PTR [rdx+608] + vmovdqu ymm11, YMMWORD PTR [rdx+640] + vmovdqu ymm13, YMMWORD PTR [rdx+672] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq 
ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 2: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+704] + vmovdqu ymm12, YMMWORD PTR [rdx+736] + vmovdqu ymm11, YMMWORD PTR [rdx+768] + vmovdqu ymm13, YMMWORD PTR [rdx+800] + vpshufd ymm8, ymm4, 216 + vpshufd ymm9, ymm5, 216 + vpunpckldq ymm4, ymm8, ymm9 + vpunpckhdq ymm5, ymm8, ymm9 + vpshufd ymm8, ymm6, 216 + vpshufd ymm9, ymm7, 216 + vpunpckldq ymm6, ymm8, ymm9 + vpunpckhdq ymm7, ymm8, ymm9 + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 4: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+832] + vmovdqu ymm12, YMMWORD PTR [rdx+864] + vmovdqu ymm11, YMMWORD PTR [rdx+896] + vmovdqu ymm13, YMMWORD PTR [rdx+928] + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm5, ymm4, ymm5 + vpunpcklqdq ymm9, ymm6, ymm7 + vpunpckhqdq ymm7, ymm6, ymm7 + vpsubd ymm4, ymm8, ymm5 + vpaddd ymm8, ymm8, ymm5 + vpmulld ymm5, ymm4, ymm12 + vmovshdup ymm6, ymm4 + 
vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm4, ymm5 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm4, ymm9, ymm7 + vpaddd ymm9, ymm9, ymm7 + vpmulld ymm7, ymm4, ymm13 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm4, ymm7 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 8: 1/4 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+960] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+992] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+1024] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+1056] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 16: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1088] + vmovdqu ymm12, YMMWORD PTR [rdx+1120] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + 
vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 32: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1152] + vmovdqu ymm12, YMMWORD PTR [rdx+1184] + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR 
[rcx+64], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + vmovdqu YMMWORD PTR [rcx+160], ymm5 + vmovdqu YMMWORD PTR [rcx+192], ymm6 + vmovdqu YMMWORD PTR [rcx+224], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vmovdqu ymm4, YMMWORD PTR [rcx+384] + vmovdqu ymm5, YMMWORD PTR [rcx+416] + vmovdqu ymm6, YMMWORD PTR [rcx+448] + vmovdqu ymm7, YMMWORD PTR [rcx+480] + ; 1: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1216] + vmovdqu ymm12, YMMWORD PTR [rdx+1248] + vmovdqu ymm11, YMMWORD PTR [rdx+1280] + vmovdqu ymm13, YMMWORD PTR [rdx+1312] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 2: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1344] + vmovdqu ymm12, YMMWORD PTR [rdx+1376] + vmovdqu ymm11, YMMWORD PTR [rdx+1408] + vmovdqu ymm13, YMMWORD PTR [rdx+1440] + vpshufd ymm8, ymm0, 216 + vpshufd ymm9, ymm1, 216 + vpunpckldq ymm0, ymm8, ymm9 + vpunpckhdq ymm1, ymm8, ymm9 + vpshufd ymm8, ymm2, 216 + vpshufd ymm9, ymm3, 216 + vpunpckldq ymm2, ymm8, ymm9 + vpunpckhdq ymm3, ymm8, ymm9 + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup 
ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 4: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1472] + vmovdqu ymm12, YMMWORD PTR [rdx+1504] + vmovdqu ymm11, YMMWORD PTR [rdx+1536] + vmovdqu ymm13, YMMWORD PTR [rdx+1568] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm1, ymm0, ymm1 + vpunpcklqdq ymm9, ymm2, ymm3 + vpunpckhqdq ymm3, ymm2, ymm3 + vpsubd ymm0, ymm8, ymm1 + vpaddd ymm8, ymm8, ymm1 + vpmulld ymm1, ymm0, ymm12 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm0, ymm1 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm0, ymm9, ymm3 + vpaddd ymm9, ymm9, ymm3 + vpmulld ymm3, ymm0, ymm13 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm0, ymm3 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 8: 2/4 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+1600] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+1632] + vperm2i128 ymm2, ymm9, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+1664] + vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+1696] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + 
vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 16: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1728] + vmovdqu ymm12, YMMWORD PTR [rdx+1760] + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 1: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1792] + vmovdqu ymm12, YMMWORD PTR [rdx+1824] + vmovdqu ymm11, YMMWORD PTR [rdx+1856] + vmovdqu ymm13, YMMWORD PTR [rdx+1888] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq 
ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 2: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1920] + vmovdqu ymm12, YMMWORD PTR [rdx+1952] + vmovdqu ymm11, YMMWORD PTR [rdx+1984] + vmovdqu ymm13, YMMWORD PTR [rdx+2016] + vpshufd ymm8, ymm4, 216 + vpshufd ymm9, ymm5, 216 + vpunpckldq ymm4, ymm8, ymm9 + vpunpckhdq ymm5, ymm8, ymm9 + vpshufd ymm8, ymm6, 216 + vpshufd ymm9, ymm7, 216 + vpunpckldq ymm6, ymm8, ymm9 + vpunpckhdq ymm7, ymm8, ymm9 + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 4: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2048] + vmovdqu ymm12, YMMWORD PTR [rdx+2080] + vmovdqu ymm11, YMMWORD PTR [rdx+2112] + vmovdqu ymm13, YMMWORD PTR [rdx+2144] + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm5, ymm4, ymm5 + vpunpcklqdq ymm9, ymm6, ymm7 + vpunpckhqdq ymm7, ymm6, ymm7 + vpsubd ymm4, ymm8, ymm5 + vpaddd ymm8, ymm8, ymm5 + vpmulld ymm5, ymm4, ymm12 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm4, ymm5 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, 
ymm15, 170 + vpsubd ymm4, ymm9, ymm7 + vpaddd ymm9, ymm9, ymm7 + vpmulld ymm7, ymm4, ymm13 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm4, ymm7 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 8: 2/4 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+2176] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+2208] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+2240] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+2272] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 16: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2304] + vmovdqu ymm12, YMMWORD PTR [rdx+2336] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, 
ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 32: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2368] + vmovdqu ymm12, YMMWORD PTR [rdx+2400] + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vmovdqu YMMWORD PTR [rcx+256], ymm0 + vmovdqu YMMWORD PTR [rcx+288], ymm1 + vmovdqu YMMWORD PTR [rcx+320], ymm2 + vmovdqu YMMWORD PTR [rcx+352], ymm3 + vmovdqu YMMWORD PTR [rcx+384], ymm4 + vmovdqu YMMWORD PTR [rcx+416], ymm5 + vmovdqu YMMWORD PTR [rcx+448], ymm6 + vmovdqu YMMWORD PTR [rcx+480], ymm7 + vmovdqu ymm0, 
YMMWORD PTR [rcx+512] + vmovdqu ymm1, YMMWORD PTR [rcx+544] + vmovdqu ymm2, YMMWORD PTR [rcx+576] + vmovdqu ymm3, YMMWORD PTR [rcx+608] + vmovdqu ymm4, YMMWORD PTR [rcx+640] + vmovdqu ymm5, YMMWORD PTR [rcx+672] + vmovdqu ymm6, YMMWORD PTR [rcx+704] + vmovdqu ymm7, YMMWORD PTR [rcx+736] + ; 1: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2432] + vmovdqu ymm12, YMMWORD PTR [rdx+2464] + vmovdqu ymm11, YMMWORD PTR [rdx+2496] + vmovdqu ymm13, YMMWORD PTR [rdx+2528] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 2: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2560] + vmovdqu ymm12, YMMWORD PTR [rdx+2592] + vmovdqu ymm11, YMMWORD PTR [rdx+2624] + vmovdqu ymm13, YMMWORD PTR [rdx+2656] + vpshufd ymm8, ymm0, 216 + vpshufd ymm9, ymm1, 216 + vpunpckldq ymm0, ymm8, ymm9 + vpunpckhdq ymm1, ymm8, ymm9 + vpshufd ymm8, ymm2, 216 + vpshufd ymm9, ymm3, 216 + vpunpckldq ymm2, ymm8, ymm9 + vpunpckhdq ymm3, ymm8, ymm9 + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd 
ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 4: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2688] + vmovdqu ymm12, YMMWORD PTR [rdx+2720] + vmovdqu ymm11, YMMWORD PTR [rdx+2752] + vmovdqu ymm13, YMMWORD PTR [rdx+2784] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm1, ymm0, ymm1 + vpunpcklqdq ymm9, ymm2, ymm3 + vpunpckhqdq ymm3, ymm2, ymm3 + vpsubd ymm0, ymm8, ymm1 + vpaddd ymm8, ymm8, ymm1 + vpmulld ymm1, ymm0, ymm12 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm0, ymm1 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm0, ymm9, ymm3 + vpaddd ymm9, ymm9, ymm3 + vpmulld ymm3, ymm0, ymm13 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm0, ymm3 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 8: 3/4 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+2816] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+2848] + vperm2i128 ymm2, ymm9, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+2880] + vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+2912] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, 
ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 16: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2944] + vmovdqu ymm12, YMMWORD PTR [rdx+2976] + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 1: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3008] + vmovdqu ymm12, YMMWORD PTR [rdx+3040] + vmovdqu ymm11, YMMWORD PTR [rdx+3072] + vmovdqu ymm13, YMMWORD PTR [rdx+3104] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + 
vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 2: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3136] + vmovdqu ymm12, YMMWORD PTR [rdx+3168] + vmovdqu ymm11, YMMWORD PTR [rdx+3200] + vmovdqu ymm13, YMMWORD PTR [rdx+3232] + vpshufd ymm8, ymm4, 216 + vpshufd ymm9, ymm5, 216 + vpunpckldq ymm4, ymm8, ymm9 + vpunpckhdq ymm5, ymm8, ymm9 + vpshufd ymm8, ymm6, 216 + vpshufd ymm9, ymm7, 216 + vpunpckldq ymm6, ymm8, ymm9 + vpunpckhdq ymm7, ymm8, ymm9 + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 4: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3264] + vmovdqu ymm12, YMMWORD PTR [rdx+3296] + vmovdqu ymm11, YMMWORD PTR [rdx+3328] + vmovdqu ymm13, YMMWORD PTR [rdx+3360] + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm5, ymm4, ymm5 + vpunpcklqdq ymm9, ymm6, ymm7 + vpunpckhqdq ymm7, ymm6, ymm7 + vpsubd ymm4, ymm8, ymm5 + vpaddd ymm8, ymm8, ymm5 + vpmulld ymm5, ymm4, ymm12 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm4, ymm5 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm4, ymm9, ymm7 + vpaddd ymm9, ymm9, ymm7 + vpmulld ymm7, ymm4, ymm13 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, 
ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm4, ymm7 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 8: 3/4 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+3392] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+3424] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+3456] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+3488] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 16: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3520] + vmovdqu ymm12, YMMWORD PTR [rdx+3552] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 32: 3/4 + vmovdqu ymm10, YMMWORD 
PTR [rdx+3584] + vmovdqu ymm12, YMMWORD PTR [rdx+3616] + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vmovdqu YMMWORD PTR [rcx+512], ymm0 + vmovdqu YMMWORD PTR [rcx+544], ymm1 + vmovdqu YMMWORD PTR [rcx+576], ymm2 + vmovdqu YMMWORD PTR [rcx+608], ymm3 + vmovdqu YMMWORD PTR [rcx+640], ymm4 + vmovdqu YMMWORD PTR [rcx+672], ymm5 + vmovdqu YMMWORD PTR [rcx+704], ymm6 + vmovdqu YMMWORD PTR [rcx+736], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+768] + vmovdqu ymm1, YMMWORD PTR [rcx+800] + vmovdqu ymm2, YMMWORD PTR [rcx+832] + vmovdqu ymm3, YMMWORD PTR [rcx+864] + vmovdqu ymm4, YMMWORD PTR [rcx+896] + vmovdqu ymm5, YMMWORD PTR [rcx+928] + 
vmovdqu ymm6, YMMWORD PTR [rcx+960] + vmovdqu ymm7, YMMWORD PTR [rcx+992] + ; 1: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3648] + vmovdqu ymm12, YMMWORD PTR [rdx+3680] + vmovdqu ymm11, YMMWORD PTR [rdx+3712] + vmovdqu ymm13, YMMWORD PTR [rdx+3744] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 2: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3776] + vmovdqu ymm12, YMMWORD PTR [rdx+3808] + vmovdqu ymm11, YMMWORD PTR [rdx+3840] + vmovdqu ymm13, YMMWORD PTR [rdx+3872] + vpshufd ymm8, ymm0, 216 + vpshufd ymm9, ymm1, 216 + vpunpckldq ymm0, ymm8, ymm9 + vpunpckhdq ymm1, ymm8, ymm9 + vpshufd ymm8, ymm2, 216 + vpshufd ymm9, ymm3, 216 + vpunpckldq ymm2, ymm8, ymm9 + vpunpckhdq ymm3, ymm8, ymm9 + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq 
ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 4: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3904] + vmovdqu ymm12, YMMWORD PTR [rdx+3936] + vmovdqu ymm11, YMMWORD PTR [rdx+3968] + vmovdqu ymm13, YMMWORD PTR [rdx+4000] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm1, ymm0, ymm1 + vpunpcklqdq ymm9, ymm2, ymm3 + vpunpckhqdq ymm3, ymm2, ymm3 + vpsubd ymm0, ymm8, ymm1 + vpaddd ymm8, ymm8, ymm1 + vpmulld ymm1, ymm0, ymm12 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm0, ymm1 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm0, ymm9, ymm3 + vpaddd ymm9, ymm9, ymm3 + vpmulld ymm3, ymm0, ymm13 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm0, ymm3 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 8: 4/4 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+4032] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+4064] + vperm2i128 ymm2, ymm9, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+4096] + vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+4128] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, 
ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 16: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4160] + vmovdqu ymm12, YMMWORD PTR [rdx+4192] + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 1: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4224] + vmovdqu ymm12, YMMWORD PTR [rdx+4256] + vmovdqu ymm11, YMMWORD PTR [rdx+4288] + vmovdqu ymm13, YMMWORD PTR [rdx+4320] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 2: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4352] + vmovdqu ymm12, YMMWORD PTR [rdx+4384] + vmovdqu ymm11, YMMWORD PTR [rdx+4416] + vmovdqu ymm13, YMMWORD PTR 
[rdx+4448] + vpshufd ymm8, ymm4, 216 + vpshufd ymm9, ymm5, 216 + vpunpckldq ymm4, ymm8, ymm9 + vpunpckhdq ymm5, ymm8, ymm9 + vpshufd ymm8, ymm6, 216 + vpshufd ymm9, ymm7, 216 + vpunpckldq ymm6, ymm8, ymm9 + vpunpckhdq ymm7, ymm8, ymm9 + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 4: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4480] + vmovdqu ymm12, YMMWORD PTR [rdx+4512] + vmovdqu ymm11, YMMWORD PTR [rdx+4544] + vmovdqu ymm13, YMMWORD PTR [rdx+4576] + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm5, ymm4, ymm5 + vpunpcklqdq ymm9, ymm6, ymm7 + vpunpckhqdq ymm7, ymm6, ymm7 + vpsubd ymm4, ymm8, ymm5 + vpaddd ymm8, ymm8, ymm5 + vpmulld ymm5, ymm4, ymm12 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm4, ymm5 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm4, ymm9, ymm7 + vpaddd ymm9, ymm9, ymm7 + vpmulld ymm7, ymm4, ymm13 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm4, ymm7 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 8: 4/4 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, 
YMMWORD PTR [rdx+4608] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+4640] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+4672] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+4704] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 16: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4736] + vmovdqu ymm12, YMMWORD PTR [rdx+4768] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 32: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4800] + vmovdqu ymm12, YMMWORD PTR [rdx+4832] + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + 
vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vmovdqu YMMWORD PTR [rcx+768], ymm0 + vmovdqu YMMWORD PTR [rcx+800], ymm1 + vmovdqu YMMWORD PTR [rcx+832], ymm2 + vmovdqu YMMWORD PTR [rcx+896], ymm4 + vmovdqu YMMWORD PTR [rcx+928], ymm5 + vmovdqu YMMWORD PTR [rcx+960], ymm6 + vmovdqu ymm10, YMMWORD PTR [rdx+4864] + vmovdqu ymm12, YMMWORD PTR [rdx+4896] + vmovdqu ymm11, YMMWORD PTR [rdx+4928] + vmovdqu ymm13, YMMWORD PTR [rdx+4960] + vmovdqu ymm6, ymm3 + vmovdqu ymm0, YMMWORD PTR [rcx+96] + vmovdqu ymm1, YMMWORD PTR [rcx+224] + vmovdqu ymm2, YMMWORD PTR [rcx+352] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vmovdqu ymm4, YMMWORD PTR [rcx+608] + vmovdqu ymm5, YMMWORD PTR [rcx+736] + ; 64: 4/4 + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + 
vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + vmovdqu ymm10, YMMWORD PTR [rdx+4992] + vmovdqu ymm12, YMMWORD PTR [rdx+5024] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 128: 4/4 + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq 
ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vmovdqu ymm11, YMMWORD PTR [rdx+5056] + vmovdqu ymm13, YMMWORD PTR [rdx+5088] + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vpmulld ymm8, ymm0, ymm13 + vpmulld ymm10, ymm1, ymm13 + vmovshdup ymm9, ymm0 + vmovshdup ymm12, ymm1 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm1, ymm1, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm0, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm1, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm0, ymm8, ymm15, 170 + vpblendd ymm1, ymm10, ymm9, 170 + vpmulld ymm8, ymm2, ymm13 + vpmulld ymm10, ymm3, ymm13 + vmovshdup ymm9, ymm2 + vmovshdup ymm12, ymm3 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm3, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup 
ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm2, ymm8, ymm15, 170 + vpblendd ymm3, ymm10, ymm9, 170 + vpmulld ymm8, ymm4, ymm13 + vpmulld ymm10, ymm5, ymm13 + vmovshdup ymm9, ymm4 + vmovshdup ymm12, ymm5 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm5, ymm5, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm5, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm4, ymm8, ymm15, 170 + vpblendd ymm5, ymm10, ymm9, 170 + vpmulld ymm8, ymm6, ymm13 + vpmulld ymm10, ymm7, ymm13 + vmovshdup ymm9, ymm6 + vmovshdup ymm12, ymm7 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm7, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm6, ymm8, ymm15, 170 + vpblendd ymm7, ymm10, ymm9, 170 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [rcx+224], ymm1 + vmovdqu YMMWORD PTR [rcx+352], ymm2 + vmovdqu YMMWORD PTR [rcx+480], ymm3 + vmovdqu YMMWORD PTR [rcx+608], ymm4 + vmovdqu YMMWORD PTR [rcx+736], ymm5 + vmovdqu YMMWORD PTR [rcx+864], ymm6 + vmovdqu YMMWORD PTR [rcx+992], ymm7 + vmovdqu ymm10, YMMWORD PTR [rdx+4864] + vmovdqu ymm12, YMMWORD PTR [rdx+4896] + vmovdqu ymm11, YMMWORD PTR [rdx+4928] + vmovdqu ymm13, YMMWORD PTR [rdx+4960] + vmovdqu ymm0, YMMWORD PTR [rcx+64] + vmovdqu ymm1, YMMWORD PTR [rcx+192] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+448] + vmovdqu ymm4, YMMWORD PTR [rcx+576] + vmovdqu ymm5, YMMWORD PTR 
[rcx+704] + vmovdqu ymm6, YMMWORD PTR [rcx+832] + vmovdqu ymm7, YMMWORD PTR [rcx+960] + ; 64: 3/4 + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + vmovdqu ymm10, YMMWORD PTR [rdx+4992] + vmovdqu ymm12, YMMWORD PTR [rdx+5024] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 128: 3/4 + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, 
ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vmovdqu ymm11, YMMWORD PTR [rdx+5056] + vmovdqu ymm13, YMMWORD PTR [rdx+5088] + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vpmulld ymm8, ymm0, ymm13 + vpmulld ymm10, ymm1, ymm13 + vmovshdup ymm9, ymm0 + vmovshdup ymm12, ymm1 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm1, ymm1, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm0, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm1, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm0, ymm8, ymm15, 170 + vpblendd ymm1, ymm10, ymm9, 170 + vpmulld ymm8, ymm2, ymm13 + vpmulld ymm10, ymm3, ymm13 + vmovshdup ymm9, ymm2 + vmovshdup ymm12, ymm3 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + 
vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm3, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm2, ymm8, ymm15, 170 + vpblendd ymm3, ymm10, ymm9, 170 + vpmulld ymm8, ymm4, ymm13 + vpmulld ymm10, ymm5, ymm13 + vmovshdup ymm9, ymm4 + vmovshdup ymm12, ymm5 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm5, ymm5, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm5, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm4, ymm8, ymm15, 170 + vpblendd ymm5, ymm10, ymm9, 170 + vpmulld ymm8, ymm6, ymm13 + vpmulld ymm10, ymm7, ymm13 + vmovshdup ymm9, ymm6 + vmovshdup ymm12, ymm7 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm7, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm6, ymm8, ymm15, 170 + vpblendd ymm7, ymm10, ymm9, 170 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [rcx+192], ymm1 + vmovdqu YMMWORD PTR [rcx+320], ymm2 + vmovdqu YMMWORD PTR [rcx+448], ymm3 + vmovdqu YMMWORD PTR [rcx+576], ymm4 + vmovdqu YMMWORD PTR [rcx+704], ymm5 + vmovdqu YMMWORD PTR [rcx+832], ymm6 + vmovdqu YMMWORD PTR [rcx+960], ymm7 + vmovdqu ymm10, YMMWORD PTR [rdx+4864] + vmovdqu ymm12, YMMWORD PTR [rdx+4896] + vmovdqu ymm11, YMMWORD PTR [rdx+4928] + vmovdqu ymm13, YMMWORD PTR [rdx+4960] + vmovdqu ymm0, 
YMMWORD PTR [rcx+32] + vmovdqu ymm1, YMMWORD PTR [rcx+160] + vmovdqu ymm2, YMMWORD PTR [rcx+288] + vmovdqu ymm3, YMMWORD PTR [rcx+416] + vmovdqu ymm4, YMMWORD PTR [rcx+544] + vmovdqu ymm5, YMMWORD PTR [rcx+672] + vmovdqu ymm6, YMMWORD PTR [rcx+800] + vmovdqu ymm7, YMMWORD PTR [rcx+928] + ; 64: 2/4 + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + vmovdqu ymm10, YMMWORD PTR [rdx+4992] + vmovdqu ymm12, YMMWORD PTR [rdx+5024] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 128: 2/4 + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, 
ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vmovdqu ymm11, YMMWORD PTR [rdx+5056] + vmovdqu ymm13, YMMWORD PTR [rdx+5088] + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vpmulld ymm8, ymm0, ymm13 + vpmulld ymm10, ymm1, ymm13 + vmovshdup ymm9, ymm0 + vmovshdup ymm12, ymm1 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm1, ymm1, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm0, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm1, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm0, ymm8, ymm15, 170 + vpblendd ymm1, ymm10, ymm9, 170 + vpmulld ymm8, ymm2, ymm13 + vpmulld ymm10, ymm3, ymm13 + vmovshdup ymm9, ymm2 + vmovshdup ymm12, ymm3 
+ vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm3, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm2, ymm8, ymm15, 170 + vpblendd ymm3, ymm10, ymm9, 170 + vpmulld ymm8, ymm4, ymm13 + vpmulld ymm10, ymm5, ymm13 + vmovshdup ymm9, ymm4 + vmovshdup ymm12, ymm5 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm5, ymm5, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm5, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm4, ymm8, ymm15, 170 + vpblendd ymm5, ymm10, ymm9, 170 + vpmulld ymm8, ymm6, ymm13 + vpmulld ymm10, ymm7, ymm13 + vmovshdup ymm9, ymm6 + vmovshdup ymm12, ymm7 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm7, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm6, ymm8, ymm15, 170 + vpblendd ymm7, ymm10, ymm9, 170 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+160], ymm1 + vmovdqu YMMWORD PTR [rcx+288], ymm2 + vmovdqu YMMWORD PTR [rcx+416], ymm3 + vmovdqu YMMWORD PTR [rcx+544], ymm4 + vmovdqu YMMWORD PTR [rcx+672], ymm5 + vmovdqu YMMWORD PTR [rcx+800], ymm6 + vmovdqu YMMWORD 
PTR [rcx+928], ymm7 + vmovdqu ymm10, YMMWORD PTR [rdx+4864] + vmovdqu ymm12, YMMWORD PTR [rdx+4896] + vmovdqu ymm11, YMMWORD PTR [rdx+4928] + vmovdqu ymm13, YMMWORD PTR [rdx+4960] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+128] + vmovdqu ymm2, YMMWORD PTR [rcx+256] + vmovdqu ymm3, YMMWORD PTR [rcx+384] + vmovdqu ymm4, YMMWORD PTR [rcx+512] + vmovdqu ymm5, YMMWORD PTR [rcx+640] + vmovdqu ymm6, YMMWORD PTR [rcx+768] + vmovdqu ymm7, YMMWORD PTR [rcx+896] + ; 64: 1/4 + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + vmovdqu ymm10, YMMWORD PTR [rdx+4992] + vmovdqu ymm12, YMMWORD PTR [rdx+5024] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 128: 1/4 + 
vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vmovdqu ymm11, YMMWORD PTR [rdx+5056] + vmovdqu ymm13, YMMWORD PTR [rdx+5088] + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vpmulld ymm8, ymm0, ymm13 + vpmulld ymm10, ymm1, ymm13 + vmovshdup ymm9, ymm0 + vmovshdup ymm12, ymm1 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm1, ymm1, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm0, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm1, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 
+ vmovshdup ymm10, ymm10 + vpblendd ymm0, ymm8, ymm15, 170 + vpblendd ymm1, ymm10, ymm9, 170 + vpmulld ymm8, ymm2, ymm13 + vpmulld ymm10, ymm3, ymm13 + vmovshdup ymm9, ymm2 + vmovshdup ymm12, ymm3 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm3, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm2, ymm8, ymm15, 170 + vpblendd ymm3, ymm10, ymm9, 170 + vpmulld ymm8, ymm4, ymm13 + vpmulld ymm10, ymm5, ymm13 + vmovshdup ymm9, ymm4 + vmovshdup ymm12, ymm5 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm5, ymm5, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm5, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm4, ymm8, ymm15, 170 + vpblendd ymm5, ymm10, ymm9, 170 + vpmulld ymm8, ymm6, ymm13 + vpmulld ymm10, ymm7, ymm13 + vmovshdup ymm9, ymm6 + vmovshdup ymm12, ymm7 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm7, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm6, ymm8, ymm15, 170 + vpblendd ymm7, ymm10, ymm9, 170 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu 
YMMWORD PTR [rcx+256], ymm2 + vmovdqu YMMWORD PTR [rcx+384], ymm3 + vmovdqu YMMWORD PTR [rcx+512], ymm4 + vmovdqu YMMWORD PTR [rcx+640], ymm5 + vmovdqu YMMWORD PTR [rcx+768], ymm6 + vmovdqu YMMWORD PTR [rcx+896], ymm7 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +wc_mldsa_invntt_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_invntt_full_avx2 PROC + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vpxor ymm14, ymm14, ymm14 + vmovdqu ymm14, YMMWORD PTR mldsa_q + ; invntt + mov rdx, QWORD PTR [ptr_L_mldsa_avx2_zetas_inv] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vmovdqu ymm4, YMMWORD PTR [rcx+128] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vmovdqu ymm6, YMMWORD PTR [rcx+192] + vmovdqu ymm7, YMMWORD PTR [rcx+224] + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vpunpckldq ymm0, ymm8, ymm9 + vpunpckhdq ymm1, ymm8, ymm9 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vpunpckldq ymm2, ymm8, ymm9 + vpunpckhdq ymm3, ymm8, ymm9 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + ; 1: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx] + vmovdqu ymm12, YMMWORD PTR [rdx+32] + vmovdqu ymm11, YMMWORD PTR [rdx+64] + vmovdqu ymm13, YMMWORD PTR 
[rdx+96] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 2: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+128] + vmovdqu ymm12, YMMWORD PTR [rdx+160] + vmovdqu ymm11, YMMWORD PTR [rdx+192] + vmovdqu ymm13, YMMWORD PTR [rdx+224] + vpshufd ymm8, ymm0, 216 + vpshufd ymm9, ymm1, 216 + vpunpckldq ymm0, ymm8, ymm9 + vpunpckhdq ymm1, ymm8, ymm9 + vpshufd ymm8, ymm2, 216 + vpshufd ymm9, ymm3, 216 + vpunpckldq ymm2, ymm8, ymm9 + vpunpckhdq ymm3, ymm8, ymm9 + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 4: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+256] + vmovdqu ymm12, YMMWORD PTR [rdx+288] + vmovdqu ymm11, YMMWORD PTR [rdx+320] + vmovdqu 
ymm13, YMMWORD PTR [rdx+352] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm1, ymm0, ymm1 + vpunpcklqdq ymm9, ymm2, ymm3 + vpunpckhqdq ymm3, ymm2, ymm3 + vpsubd ymm0, ymm8, ymm1 + vpaddd ymm8, ymm8, ymm1 + vpmulld ymm1, ymm0, ymm12 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm0, ymm1 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm0, ymm9, ymm3 + vpaddd ymm9, ymm9, ymm3 + vpmulld ymm3, ymm0, ymm13 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm0, ymm3 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 8: 1/4 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+384] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+416] + vperm2i128 ymm2, ymm9, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+448] + vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+480] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 16: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+512] + vmovdqu ymm12, YMMWORD PTR [rdx+544] + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, 
ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vpunpckldq ymm4, ymm8, ymm9 + vpunpckhdq ymm5, ymm8, ymm9 + vpermq ymm4, ymm4, 216 + vpermq ymm5, ymm5, 216 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vpunpckldq ymm6, ymm8, ymm9 + vpunpckhdq ymm7, ymm8, ymm9 + vpermq ymm6, ymm6, 216 + vpermq ymm7, ymm7, 216 + ; 1: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+576] + vmovdqu ymm12, YMMWORD PTR [rdx+608] + vmovdqu ymm11, YMMWORD PTR [rdx+640] + vmovdqu ymm13, YMMWORD PTR [rdx+672] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 2: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+704] + vmovdqu ymm12, YMMWORD 
PTR [rdx+736] + vmovdqu ymm11, YMMWORD PTR [rdx+768] + vmovdqu ymm13, YMMWORD PTR [rdx+800] + vpshufd ymm8, ymm4, 216 + vpshufd ymm9, ymm5, 216 + vpunpckldq ymm4, ymm8, ymm9 + vpunpckhdq ymm5, ymm8, ymm9 + vpshufd ymm8, ymm6, 216 + vpshufd ymm9, ymm7, 216 + vpunpckldq ymm6, ymm8, ymm9 + vpunpckhdq ymm7, ymm8, ymm9 + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 4: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+832] + vmovdqu ymm12, YMMWORD PTR [rdx+864] + vmovdqu ymm11, YMMWORD PTR [rdx+896] + vmovdqu ymm13, YMMWORD PTR [rdx+928] + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm5, ymm4, ymm5 + vpunpcklqdq ymm9, ymm6, ymm7 + vpunpckhqdq ymm7, ymm6, ymm7 + vpsubd ymm4, ymm8, ymm5 + vpaddd ymm8, ymm8, ymm5 + vpmulld ymm5, ymm4, ymm12 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm4, ymm5 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm4, ymm9, ymm7 + vpaddd ymm9, ymm9, ymm7 + vpmulld ymm7, ymm4, ymm13 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm4, ymm7 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, 
ymm7, ymm15, 170 + ; 8: 1/4 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+960] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+992] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+1024] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+1056] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 16: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1088] + vmovdqu ymm12, YMMWORD PTR [rdx+1120] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 32: 1/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1152] + vmovdqu ymm12, YMMWORD PTR [rdx+1184] + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + 
vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + vmovdqu YMMWORD PTR [rcx+160], ymm5 + vmovdqu YMMWORD PTR [rcx+192], ymm6 + vmovdqu YMMWORD PTR [rcx+224], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vmovdqu ymm4, YMMWORD PTR [rcx+384] + vmovdqu ymm5, YMMWORD PTR [rcx+416] + vmovdqu ymm6, YMMWORD PTR [rcx+448] + vmovdqu ymm7, YMMWORD PTR [rcx+480] + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vpunpckldq 
ymm0, ymm8, ymm9 + vpunpckhdq ymm1, ymm8, ymm9 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vpunpckldq ymm2, ymm8, ymm9 + vpunpckhdq ymm3, ymm8, ymm9 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + ; 1: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1216] + vmovdqu ymm12, YMMWORD PTR [rdx+1248] + vmovdqu ymm11, YMMWORD PTR [rdx+1280] + vmovdqu ymm13, YMMWORD PTR [rdx+1312] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 2: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1344] + vmovdqu ymm12, YMMWORD PTR [rdx+1376] + vmovdqu ymm11, YMMWORD PTR [rdx+1408] + vmovdqu ymm13, YMMWORD PTR [rdx+1440] + vpshufd ymm8, ymm0, 216 + vpshufd ymm9, ymm1, 216 + vpunpckldq ymm0, ymm8, ymm9 + vpunpckhdq ymm1, ymm8, ymm9 + vpshufd ymm8, ymm2, 216 + vpshufd ymm9, ymm3, 216 + vpunpckldq ymm2, ymm8, ymm9 + vpunpckhdq ymm3, ymm8, ymm9 + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + 
vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 4: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1472] + vmovdqu ymm12, YMMWORD PTR [rdx+1504] + vmovdqu ymm11, YMMWORD PTR [rdx+1536] + vmovdqu ymm13, YMMWORD PTR [rdx+1568] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm1, ymm0, ymm1 + vpunpcklqdq ymm9, ymm2, ymm3 + vpunpckhqdq ymm3, ymm2, ymm3 + vpsubd ymm0, ymm8, ymm1 + vpaddd ymm8, ymm8, ymm1 + vpmulld ymm1, ymm0, ymm12 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm0, ymm1 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm0, ymm9, ymm3 + vpaddd ymm9, ymm9, ymm3 + vpmulld ymm3, ymm0, ymm13 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm0, ymm3 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 8: 2/4 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+1600] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+1632] + vperm2i128 ymm2, ymm9, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+1664] + vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+1696] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, 
ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 16: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1728] + vmovdqu ymm12, YMMWORD PTR [rdx+1760] + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vpunpckldq ymm4, ymm8, ymm9 + vpunpckhdq ymm5, ymm8, ymm9 + vpermq ymm4, ymm4, 216 + vpermq ymm5, ymm5, 216 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vpunpckldq ymm6, ymm8, ymm9 + vpunpckhdq ymm7, ymm8, ymm9 + vpermq ymm6, ymm6, 216 + vpermq ymm7, ymm7, 216 + ; 1: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1792] + vmovdqu ymm12, YMMWORD PTR [rdx+1824] + vmovdqu ymm11, YMMWORD PTR [rdx+1856] + vmovdqu ymm13, YMMWORD PTR [rdx+1888] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + 
vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 2: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+1920] + vmovdqu ymm12, YMMWORD PTR [rdx+1952] + vmovdqu ymm11, YMMWORD PTR [rdx+1984] + vmovdqu ymm13, YMMWORD PTR [rdx+2016] + vpshufd ymm8, ymm4, 216 + vpshufd ymm9, ymm5, 216 + vpunpckldq ymm4, ymm8, ymm9 + vpunpckhdq ymm5, ymm8, ymm9 + vpshufd ymm8, ymm6, 216 + vpshufd ymm9, ymm7, 216 + vpunpckldq ymm6, ymm8, ymm9 + vpunpckhdq ymm7, ymm8, ymm9 + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 4: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2048] + vmovdqu ymm12, YMMWORD PTR [rdx+2080] + vmovdqu ymm11, YMMWORD PTR [rdx+2112] + vmovdqu ymm13, YMMWORD PTR [rdx+2144] + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm5, ymm4, ymm5 + vpunpcklqdq ymm9, ymm6, ymm7 + vpunpckhqdq ymm7, ymm6, ymm7 + vpsubd ymm4, ymm8, ymm5 + vpaddd ymm8, ymm8, ymm5 + vpmulld ymm5, ymm4, ymm12 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, 
ymm15, ymm14 + vpsubq ymm5, ymm4, ymm5 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm4, ymm9, ymm7 + vpaddd ymm9, ymm9, ymm7 + vpmulld ymm7, ymm4, ymm13 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm4, ymm7 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 8: 2/4 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+2176] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+2208] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+2240] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+2272] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 16: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2304] + vmovdqu ymm12, YMMWORD PTR [rdx+2336] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, 
ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 32: 2/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2368] + vmovdqu ymm12, YMMWORD PTR [rdx+2400] + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vmovdqu YMMWORD PTR [rcx+256], ymm0 + vmovdqu YMMWORD PTR [rcx+288], ymm1 + vmovdqu YMMWORD PTR [rcx+320], ymm2 + vmovdqu YMMWORD PTR [rcx+352], ymm3 + vmovdqu YMMWORD PTR [rcx+384], ymm4 + vmovdqu YMMWORD PTR 
[rcx+416], ymm5 + vmovdqu YMMWORD PTR [rcx+448], ymm6 + vmovdqu YMMWORD PTR [rcx+480], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+512] + vmovdqu ymm1, YMMWORD PTR [rcx+544] + vmovdqu ymm2, YMMWORD PTR [rcx+576] + vmovdqu ymm3, YMMWORD PTR [rcx+608] + vmovdqu ymm4, YMMWORD PTR [rcx+640] + vmovdqu ymm5, YMMWORD PTR [rcx+672] + vmovdqu ymm6, YMMWORD PTR [rcx+704] + vmovdqu ymm7, YMMWORD PTR [rcx+736] + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vpunpckldq ymm0, ymm8, ymm9 + vpunpckhdq ymm1, ymm8, ymm9 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vpunpckldq ymm2, ymm8, ymm9 + vpunpckhdq ymm3, ymm8, ymm9 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + ; 1: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2432] + vmovdqu ymm12, YMMWORD PTR [rdx+2464] + vmovdqu ymm11, YMMWORD PTR [rdx+2496] + vmovdqu ymm13, YMMWORD PTR [rdx+2528] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 2: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2560] + vmovdqu ymm12, YMMWORD PTR [rdx+2592] + vmovdqu ymm11, YMMWORD PTR [rdx+2624] + vmovdqu ymm13, YMMWORD PTR [rdx+2656] + vpshufd ymm8, ymm0, 216 + vpshufd ymm9, ymm1, 216 + vpunpckldq ymm0, ymm8, ymm9 + vpunpckhdq ymm1, ymm8, ymm9 + vpshufd ymm8, ymm2, 216 + vpshufd ymm9, ymm3, 216 + 
vpunpckldq ymm2, ymm8, ymm9 + vpunpckhdq ymm3, ymm8, ymm9 + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 4: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2688] + vmovdqu ymm12, YMMWORD PTR [rdx+2720] + vmovdqu ymm11, YMMWORD PTR [rdx+2752] + vmovdqu ymm13, YMMWORD PTR [rdx+2784] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm1, ymm0, ymm1 + vpunpcklqdq ymm9, ymm2, ymm3 + vpunpckhqdq ymm3, ymm2, ymm3 + vpsubd ymm0, ymm8, ymm1 + vpaddd ymm8, ymm8, ymm1 + vpmulld ymm1, ymm0, ymm12 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm0, ymm1 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm0, ymm9, ymm3 + vpaddd ymm9, ymm9, ymm3 + vpmulld ymm3, ymm0, ymm13 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm0, ymm3 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 8: 3/4 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+2816] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+2848] + vperm2i128 ymm2, ymm9, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+2880] + 
vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+2912] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 16: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+2944] + vmovdqu ymm12, YMMWORD PTR [rdx+2976] + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vpunpckldq ymm4, ymm8, ymm9 + vpunpckhdq ymm5, ymm8, ymm9 + vpermq ymm4, ymm4, 216 + vpermq ymm5, ymm5, 216 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vpunpckldq ymm6, ymm8, ymm9 + vpunpckhdq ymm7, ymm8, ymm9 + vpermq ymm6, ymm6, 216 + vpermq ymm7, ymm7, 216 + ; 1: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3008] + vmovdqu ymm12, YMMWORD PTR 
[rdx+3040] + vmovdqu ymm11, YMMWORD PTR [rdx+3072] + vmovdqu ymm13, YMMWORD PTR [rdx+3104] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 2: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3136] + vmovdqu ymm12, YMMWORD PTR [rdx+3168] + vmovdqu ymm11, YMMWORD PTR [rdx+3200] + vmovdqu ymm13, YMMWORD PTR [rdx+3232] + vpshufd ymm8, ymm4, 216 + vpshufd ymm9, ymm5, 216 + vpunpckldq ymm4, ymm8, ymm9 + vpunpckhdq ymm5, ymm8, ymm9 + vpshufd ymm8, ymm6, 216 + vpshufd ymm9, ymm7, 216 + vpunpckldq ymm6, ymm8, ymm9 + vpunpckhdq ymm7, ymm8, ymm9 + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 4: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3264] + 
vmovdqu ymm12, YMMWORD PTR [rdx+3296] + vmovdqu ymm11, YMMWORD PTR [rdx+3328] + vmovdqu ymm13, YMMWORD PTR [rdx+3360] + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm5, ymm4, ymm5 + vpunpcklqdq ymm9, ymm6, ymm7 + vpunpckhqdq ymm7, ymm6, ymm7 + vpsubd ymm4, ymm8, ymm5 + vpaddd ymm8, ymm8, ymm5 + vpmulld ymm5, ymm4, ymm12 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm4, ymm5 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm4, ymm9, ymm7 + vpaddd ymm9, ymm9, ymm7 + vpmulld ymm7, ymm4, ymm13 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm4, ymm7 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 8: 3/4 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+3392] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+3424] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+3456] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+3488] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 16: 3/4 + vmovdqu ymm10, YMMWORD PTR 
[rdx+3520] + vmovdqu ymm12, YMMWORD PTR [rdx+3552] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 32: 3/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3584] + vmovdqu ymm12, YMMWORD PTR [rdx+3616] + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup 
ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vmovdqu YMMWORD PTR [rcx+512], ymm0 + vmovdqu YMMWORD PTR [rcx+544], ymm1 + vmovdqu YMMWORD PTR [rcx+576], ymm2 + vmovdqu YMMWORD PTR [rcx+608], ymm3 + vmovdqu YMMWORD PTR [rcx+640], ymm4 + vmovdqu YMMWORD PTR [rcx+672], ymm5 + vmovdqu YMMWORD PTR [rcx+704], ymm6 + vmovdqu YMMWORD PTR [rcx+736], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+768] + vmovdqu ymm1, YMMWORD PTR [rcx+800] + vmovdqu ymm2, YMMWORD PTR [rcx+832] + vmovdqu ymm3, YMMWORD PTR [rcx+864] + vmovdqu ymm4, YMMWORD PTR [rcx+896] + vmovdqu ymm5, YMMWORD PTR [rcx+928] + vmovdqu ymm6, YMMWORD PTR [rcx+960] + vmovdqu ymm7, YMMWORD PTR [rcx+992] + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vpunpckldq ymm0, ymm8, ymm9 + vpunpckhdq ymm1, ymm8, ymm9 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vpunpckldq ymm2, ymm8, ymm9 + vpunpckhdq ymm3, ymm8, ymm9 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + ; 1: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3648] + vmovdqu ymm12, YMMWORD PTR [rdx+3680] + vmovdqu ymm11, YMMWORD PTR [rdx+3712] + vmovdqu ymm13, YMMWORD PTR [rdx+3744] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + 
vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 2: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3776] + vmovdqu ymm12, YMMWORD PTR [rdx+3808] + vmovdqu ymm11, YMMWORD PTR [rdx+3840] + vmovdqu ymm13, YMMWORD PTR [rdx+3872] + vpshufd ymm8, ymm0, 216 + vpshufd ymm9, ymm1, 216 + vpunpckldq ymm0, ymm8, ymm9 + vpunpckhdq ymm1, ymm8, ymm9 + vpshufd ymm8, ymm2, 216 + vpshufd ymm9, ymm3, 216 + vpunpckldq ymm2, ymm8, ymm9 + vpunpckhdq ymm3, ymm8, ymm9 + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 4: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+3904] + vmovdqu ymm12, YMMWORD PTR [rdx+3936] + vmovdqu ymm11, YMMWORD PTR [rdx+3968] + vmovdqu ymm13, YMMWORD PTR [rdx+4000] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm1, ymm0, ymm1 + vpunpcklqdq ymm9, ymm2, ymm3 + vpunpckhqdq ymm3, ymm2, ymm3 + vpsubd ymm0, ymm8, ymm1 + vpaddd ymm8, ymm8, ymm1 + vpmulld ymm1, ymm0, ymm12 + vmovshdup ymm2, ymm0 + vpmuldq ymm0, ymm0, ymm10 + vpmuldq ymm2, ymm2, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm0, ymm1 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm0, ymm9, ymm3 + vpaddd ymm9, ymm9, ymm3 + vpmulld ymm3, ymm0, ymm13 + vmovshdup ymm2, ymm0 + vpmuldq 
ymm0, ymm0, ymm11 + vpmuldq ymm2, ymm2, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm0, ymm3 + vpsubq ymm15, ymm2, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 8: 4/4 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+4032] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+4064] + vperm2i128 ymm2, ymm9, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+4096] + vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+4128] + vpsubd ymm8, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpmulld ymm1, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm1 + vpmuldq ymm1, ymm1, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm1, ymm8, ymm1 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm1, ymm1 + vpblendd ymm1, ymm1, ymm15, 170 + vpsubd ymm8, ymm2, ymm3 + vpaddd ymm2, ymm2, ymm3 + vpmulld ymm3, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + ; 16: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4160] + vmovdqu ymm12, YMMWORD PTR [rdx+4192] + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + 
vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vpunpckldq ymm4, ymm8, ymm9 + vpunpckhdq ymm5, ymm8, ymm9 + vpermq ymm4, ymm4, 216 + vpermq ymm5, ymm5, 216 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vpunpckldq ymm6, ymm8, ymm9 + vpunpckhdq ymm7, ymm8, ymm9 + vpermq ymm6, ymm6, 216 + vpermq ymm7, ymm7, 216 + ; 1: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4224] + vmovdqu ymm12, YMMWORD PTR [rdx+4256] + vmovdqu ymm11, YMMWORD PTR [rdx+4288] + vmovdqu ymm13, YMMWORD PTR [rdx+4320] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vmovshdup ymm10, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vmovshdup ymm11, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 2: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4352] + vmovdqu ymm12, YMMWORD PTR [rdx+4384] + vmovdqu ymm11, YMMWORD PTR [rdx+4416] + vmovdqu ymm13, YMMWORD PTR [rdx+4448] + vpshufd ymm8, ymm4, 216 + vpshufd ymm9, ymm5, 216 + vpunpckldq ymm4, ymm8, ymm9 + vpunpckhdq ymm5, ymm8, ymm9 + vpshufd ymm8, ymm6, 216 + vpshufd ymm9, ymm7, 216 + vpunpckldq ymm6, ymm8, ymm9 + vpunpckhdq ymm7, ymm8, ymm9 + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, 
ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 4: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4480] + vmovdqu ymm12, YMMWORD PTR [rdx+4512] + vmovdqu ymm11, YMMWORD PTR [rdx+4544] + vmovdqu ymm13, YMMWORD PTR [rdx+4576] + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm5, ymm4, ymm5 + vpunpcklqdq ymm9, ymm6, ymm7 + vpunpckhqdq ymm7, ymm6, ymm7 + vpsubd ymm4, ymm8, ymm5 + vpaddd ymm8, ymm8, ymm5 + vpmulld ymm5, ymm4, ymm12 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm6, ymm6, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm4, ymm5 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm4, ymm9, ymm7 + vpaddd ymm9, ymm9, ymm7 + vpmulld ymm7, ymm4, ymm13 + vmovshdup ymm6, ymm4 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm6, ymm6, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm4, ymm7 + vpsubq ymm15, ymm6, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 8: 4/4 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rdx+4608] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rdx+4640] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rdx+4672] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rdx+4704] + vpsubd ymm8, ymm4, ymm5 + vpaddd ymm4, ymm4, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + 
vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm6, ymm7 + vpaddd ymm6, ymm6, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 16: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4736] + vmovdqu ymm12, YMMWORD PTR [rdx+4768] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 32: 4/4 + vmovdqu ymm10, YMMWORD PTR [rdx+4800] + vmovdqu ymm12, YMMWORD PTR [rdx+4832] + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + 
vpblendd ymm5, ymm5, ymm15, 170 + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vmovdqu YMMWORD PTR [rcx+768], ymm0 + vmovdqu YMMWORD PTR [rcx+800], ymm1 + vmovdqu YMMWORD PTR [rcx+832], ymm2 + vmovdqu YMMWORD PTR [rcx+896], ymm4 + vmovdqu YMMWORD PTR [rcx+928], ymm5 + vmovdqu YMMWORD PTR [rcx+960], ymm6 + vmovdqu ymm10, YMMWORD PTR [rdx+4864] + vmovdqu ymm12, YMMWORD PTR [rdx+4896] + vmovdqu ymm11, YMMWORD PTR [rdx+4928] + vmovdqu ymm13, YMMWORD PTR [rdx+4960] + vmovdqu ymm6, ymm3 + vmovdqu ymm0, YMMWORD PTR [rcx+96] + vmovdqu ymm1, YMMWORD PTR [rcx+224] + vmovdqu ymm2, YMMWORD PTR [rcx+352] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vmovdqu ymm4, YMMWORD PTR [rcx+608] + vmovdqu ymm5, YMMWORD PTR [rcx+736] + ; 64: 4/4 + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, 
ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + vmovdqu ymm10, YMMWORD PTR [rdx+4992] + vmovdqu ymm12, YMMWORD PTR [rdx+5024] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 128: 4/4 + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vmovdqu ymm11, YMMWORD PTR [rdx+5056] + vmovdqu ymm13, YMMWORD PTR [rdx+5088] + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd 
ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vpmulld ymm8, ymm0, ymm13 + vpmulld ymm10, ymm1, ymm13 + vmovshdup ymm9, ymm0 + vmovshdup ymm12, ymm1 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm1, ymm1, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm0, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm1, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm0, ymm8, ymm15, 170 + vpblendd ymm1, ymm10, ymm9, 170 + vpmulld ymm8, ymm2, ymm13 + vpmulld ymm10, ymm3, ymm13 + vmovshdup ymm9, ymm2 + vmovshdup ymm12, ymm3 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm3, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm2, ymm8, ymm15, 170 + vpblendd ymm3, ymm10, ymm9, 170 + vpmulld ymm8, ymm4, ymm13 + vpmulld ymm10, ymm5, ymm13 + vmovshdup ymm9, ymm4 + vmovshdup ymm12, ymm5 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm5, ymm5, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + 
vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm5, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm4, ymm8, ymm15, 170 + vpblendd ymm5, ymm10, ymm9, 170 + vpmulld ymm8, ymm6, ymm13 + vpmulld ymm10, ymm7, ymm13 + vmovshdup ymm9, ymm6 + vmovshdup ymm12, ymm7 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm7, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm6, ymm8, ymm15, 170 + vpblendd ymm7, ymm10, ymm9, 170 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + vmovdqu YMMWORD PTR [rcx+224], ymm1 + vmovdqu YMMWORD PTR [rcx+352], ymm2 + vmovdqu YMMWORD PTR [rcx+480], ymm3 + vmovdqu YMMWORD PTR [rcx+608], ymm4 + vmovdqu YMMWORD PTR [rcx+736], ymm5 + vmovdqu YMMWORD PTR [rcx+864], ymm6 + vmovdqu YMMWORD PTR [rcx+992], ymm7 + vmovdqu ymm10, YMMWORD PTR [rdx+4864] + vmovdqu ymm12, YMMWORD PTR [rdx+4896] + vmovdqu ymm11, YMMWORD PTR [rdx+4928] + vmovdqu ymm13, YMMWORD PTR [rdx+4960] + vmovdqu ymm0, YMMWORD PTR [rcx+64] + vmovdqu ymm1, YMMWORD PTR [rcx+192] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+448] + vmovdqu ymm4, YMMWORD PTR [rcx+576] + vmovdqu ymm5, YMMWORD PTR [rcx+704] + vmovdqu ymm6, YMMWORD PTR [rcx+832] + vmovdqu ymm7, YMMWORD PTR [rcx+960] + ; 64: 3/4 + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld 
ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + vmovdqu ymm10, YMMWORD PTR [rdx+4992] + vmovdqu ymm12, YMMWORD PTR [rdx+5024] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 128: 3/4 + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vmovdqu ymm11, YMMWORD PTR [rdx+5056] + vmovdqu ymm13, YMMWORD PTR [rdx+5088] + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, 
ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vpmulld ymm8, ymm0, ymm13 + vpmulld ymm10, ymm1, ymm13 + vmovshdup ymm9, ymm0 + vmovshdup ymm12, ymm1 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm1, ymm1, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm0, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm1, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm0, ymm8, ymm15, 170 + vpblendd ymm1, ymm10, ymm9, 170 + vpmulld ymm8, ymm2, ymm13 + vpmulld ymm10, ymm3, ymm13 + vmovshdup ymm9, ymm2 + vmovshdup ymm12, ymm3 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm3, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm2, ymm8, ymm15, 170 + vpblendd ymm3, ymm10, ymm9, 170 + vpmulld ymm8, ymm4, ymm13 + vpmulld ymm10, ymm5, ymm13 + vmovshdup ymm9, ymm4 + vmovshdup ymm12, ymm5 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm5, ymm5, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, 
ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm5, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm4, ymm8, ymm15, 170 + vpblendd ymm5, ymm10, ymm9, 170 + vpmulld ymm8, ymm6, ymm13 + vpmulld ymm10, ymm7, ymm13 + vmovshdup ymm9, ymm6 + vmovshdup ymm12, ymm7 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm7, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm6, ymm8, ymm15, 170 + vpblendd ymm7, ymm10, ymm9, 170 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [rcx+192], ymm1 + vmovdqu YMMWORD PTR [rcx+320], ymm2 + vmovdqu YMMWORD PTR [rcx+448], ymm3 + vmovdqu YMMWORD PTR [rcx+576], ymm4 + vmovdqu YMMWORD PTR [rcx+704], ymm5 + vmovdqu YMMWORD PTR [rcx+832], ymm6 + vmovdqu YMMWORD PTR [rcx+960], ymm7 + vmovdqu ymm10, YMMWORD PTR [rdx+4864] + vmovdqu ymm12, YMMWORD PTR [rdx+4896] + vmovdqu ymm11, YMMWORD PTR [rdx+4928] + vmovdqu ymm13, YMMWORD PTR [rdx+4960] + vmovdqu ymm0, YMMWORD PTR [rcx+32] + vmovdqu ymm1, YMMWORD PTR [rcx+160] + vmovdqu ymm2, YMMWORD PTR [rcx+288] + vmovdqu ymm3, YMMWORD PTR [rcx+416] + vmovdqu ymm4, YMMWORD PTR [rcx+544] + vmovdqu ymm5, YMMWORD PTR [rcx+672] + vmovdqu ymm6, YMMWORD PTR [rcx+800] + vmovdqu ymm7, YMMWORD PTR [rcx+928] + ; 64: 2/4 + vpsubd ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + 
vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + vmovdqu ymm10, YMMWORD PTR [rdx+4992] + vmovdqu ymm12, YMMWORD PTR [rdx+5024] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 128: 2/4 + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + 
vmovdqu ymm11, YMMWORD PTR [rdx+5056] + vmovdqu ymm13, YMMWORD PTR [rdx+5088] + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vpmulld ymm8, ymm0, ymm13 + vpmulld ymm10, ymm1, ymm13 + vmovshdup ymm9, ymm0 + vmovshdup ymm12, ymm1 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm1, ymm1, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm0, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm1, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm0, ymm8, ymm15, 170 + vpblendd ymm1, ymm10, ymm9, 170 + vpmulld ymm8, ymm2, ymm13 + vpmulld ymm10, ymm3, ymm13 + vmovshdup ymm9, ymm2 + vmovshdup ymm12, ymm3 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm3, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm2, ymm8, ymm15, 170 + vpblendd ymm3, ymm10, ymm9, 170 + 
vpmulld ymm8, ymm4, ymm13 + vpmulld ymm10, ymm5, ymm13 + vmovshdup ymm9, ymm4 + vmovshdup ymm12, ymm5 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm5, ymm5, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm5, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm4, ymm8, ymm15, 170 + vpblendd ymm5, ymm10, ymm9, 170 + vpmulld ymm8, ymm6, ymm13 + vpmulld ymm10, ymm7, ymm13 + vmovshdup ymm9, ymm6 + vmovshdup ymm12, ymm7 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm7, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm6, ymm8, ymm15, 170 + vpblendd ymm7, ymm10, ymm9, 170 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + vmovdqu YMMWORD PTR [rcx+160], ymm1 + vmovdqu YMMWORD PTR [rcx+288], ymm2 + vmovdqu YMMWORD PTR [rcx+416], ymm3 + vmovdqu YMMWORD PTR [rcx+544], ymm4 + vmovdqu YMMWORD PTR [rcx+672], ymm5 + vmovdqu YMMWORD PTR [rcx+800], ymm6 + vmovdqu YMMWORD PTR [rcx+928], ymm7 + vmovdqu ymm10, YMMWORD PTR [rdx+4864] + vmovdqu ymm12, YMMWORD PTR [rdx+4896] + vmovdqu ymm11, YMMWORD PTR [rdx+4928] + vmovdqu ymm13, YMMWORD PTR [rdx+4960] + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+128] + vmovdqu ymm2, YMMWORD PTR [rcx+256] + vmovdqu ymm3, YMMWORD PTR [rcx+384] + vmovdqu ymm4, YMMWORD PTR [rcx+512] + vmovdqu ymm5, YMMWORD PTR [rcx+640] + vmovdqu ymm6, YMMWORD PTR [rcx+768] + vmovdqu ymm7, YMMWORD PTR [rcx+896] + ; 64: 1/4 + vpsubd 
ymm8, ymm0, ymm2 + vpaddd ymm0, ymm0, ymm2 + vpmulld ymm2, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm2 + vpmuldq ymm2, ymm2, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm2, ymm8, ymm2 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm2, ymm2 + vpblendd ymm2, ymm2, ymm15, 170 + vpsubd ymm8, ymm1, ymm3 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm3, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm3 + vpmuldq ymm3, ymm3, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm3, ymm8, ymm3 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm3, ymm3 + vpblendd ymm3, ymm3, ymm15, 170 + vmovdqu ymm10, YMMWORD PTR [rdx+4992] + vmovdqu ymm12, YMMWORD PTR [rdx+5024] + vpsubd ymm8, ymm4, ymm6 + vpaddd ymm4, ymm4, ymm6 + vpmulld ymm6, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm5, ymm7 + vpaddd ymm5, ymm5, ymm7 + vpmulld ymm7, ymm8, ymm13 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + ; 128: 1/4 + vpsubd ymm8, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm4 + vpmulld ymm4, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm4 + vpmuldq ymm4, ymm4, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm4, ymm8, ymm4 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm4, ymm4 + vpblendd ymm4, ymm4, ymm15, 170 + vpsubd ymm8, ymm1, ymm5 + vpaddd ymm1, ymm1, ymm5 + vpmulld ymm5, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, 
ymm10 + vmovshdup ymm15, ymm5 + vpmuldq ymm5, ymm5, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm5, ymm8, ymm5 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm5, ymm5 + vpblendd ymm5, ymm5, ymm15, 170 + vmovdqu ymm11, YMMWORD PTR [rdx+5056] + vmovdqu ymm13, YMMWORD PTR [rdx+5088] + vpsubd ymm8, ymm2, ymm6 + vpaddd ymm2, ymm2, ymm6 + vpmulld ymm6, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm6 + vpmuldq ymm6, ymm6, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm6, ymm8, ymm6 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm6, ymm6 + vpblendd ymm6, ymm6, ymm15, 170 + vpsubd ymm8, ymm3, ymm7 + vpaddd ymm3, ymm3, ymm7 + vpmulld ymm7, ymm8, ymm12 + vmovshdup ymm9, ymm8 + vpmuldq ymm8, ymm8, ymm10 + vpmuldq ymm9, ymm9, ymm10 + vmovshdup ymm15, ymm7 + vpmuldq ymm7, ymm7, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm7, ymm8, ymm7 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm7, ymm7 + vpblendd ymm7, ymm7, ymm15, 170 + vpmulld ymm8, ymm0, ymm13 + vpmulld ymm10, ymm1, ymm13 + vmovshdup ymm9, ymm0 + vmovshdup ymm12, ymm1 + vpmuldq ymm0, ymm0, ymm11 + vpmuldq ymm1, ymm1, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm0, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm1, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm0, ymm8, ymm15, 170 + vpblendd ymm1, ymm10, ymm9, 170 + vpmulld ymm8, ymm2, ymm13 + vpmulld ymm10, ymm3, ymm13 + vmovshdup ymm9, ymm2 + vmovshdup ymm12, ymm3 + vpmuldq ymm2, ymm2, ymm11 + vpmuldq ymm3, ymm3, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm2, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + 
vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm3, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm2, ymm8, ymm15, 170 + vpblendd ymm3, ymm10, ymm9, 170 + vpmulld ymm8, ymm4, ymm13 + vpmulld ymm10, ymm5, ymm13 + vmovshdup ymm9, ymm4 + vmovshdup ymm12, ymm5 + vpmuldq ymm4, ymm4, ymm11 + vpmuldq ymm5, ymm5, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm4, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm5, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm4, ymm8, ymm15, 170 + vpblendd ymm5, ymm10, ymm9, 170 + vpmulld ymm8, ymm6, ymm13 + vpmulld ymm10, ymm7, ymm13 + vmovshdup ymm9, ymm6 + vmovshdup ymm12, ymm7 + vpmuldq ymm6, ymm6, ymm11 + vpmuldq ymm7, ymm7, ymm11 + vpmuldq ymm9, ymm9, ymm11 + vpmuldq ymm12, ymm12, ymm11 + vmovshdup ymm15, ymm8 + vpmuldq ymm8, ymm8, ymm14 + vpmuldq ymm15, ymm15, ymm14 + vpsubq ymm8, ymm6, ymm8 + vpsubq ymm15, ymm9, ymm15 + vmovshdup ymm9, ymm10 + vpmuldq ymm10, ymm10, ymm14 + vpmuldq ymm9, ymm9, ymm14 + vpsubq ymm10, ymm7, ymm10 + vpsubq ymm9, ymm12, ymm9 + vmovshdup ymm8, ymm8 + vmovshdup ymm10, ymm10 + vpblendd ymm6, ymm8, ymm15, 170 + vpblendd ymm7, ymm10, ymm9, 170 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [rcx+256], ymm2 + vmovdqu YMMWORD PTR [rcx+384], ymm3 + vmovdqu YMMWORD PTR [rcx+512], ymm4 + vmovdqu YMMWORD PTR [rcx+640], ymm5 + vmovdqu YMMWORD PTR [rcx+768], ymm6 + vmovdqu YMMWORD PTR [rcx+896], ymm7 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR 
[rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +wc_mldsa_invntt_full_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_mul_avx2 PROC + sub rsp, 64 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vpxor ymm8, ymm8, ymm8 + vmovdqu ymm8, YMMWORD PTR mldsa_q + vmovdqu ymm9, YMMWORD PTR mldsa_qinv + ; 0..15 - every 16-element step below loads 2x8 dwords from [rdx] and from [r8], uses vpshufd 245 to copy the odd dwords into the even lanes, forms signed 64-bit products with vpmuldq (even elements in ymm0/ymm2, odd in ymm1/ymm3), reduces them and stores 16 dwords to [rcx]. ymm8 holds mldsa_q and ymm9 holds mldsa_qinv (the vpxor of ymm8 above is overwritten by the mldsa_q load). xmm6-xmm9 were saved to the stack on entry. + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm2, YMMWORD PTR [rdx+32] + vmovdqu ymm4, YMMWORD PTR [r8] + vmovdqu ymm6, YMMWORD PTR [r8+32] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 - t = prod * qinv (vpmulld, low 32 bits), then prod -= t * q (vpmuldq, vpsubd), so the value kept is the high dword of each qword. Even results are shifted down 32 bits and ORed with the odd ones, which relies on the odd low dwords being zero after the subtraction (presumably qinv is q^-1 mod 2^32 - confirm at the mldsa_qinv definition). + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + ; 16..31 + vmovdqu ymm0, YMMWORD PTR [rdx+64] + vmovdqu ymm2, YMMWORD PTR [rdx+96] + vmovdqu ymm4, YMMWORD PTR [r8+64] + vmovdqu ymm6, YMMWORD PTR [r8+96] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2,
ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + ; 32..47 + vmovdqu ymm0, YMMWORD PTR [rdx+128] + vmovdqu ymm2, YMMWORD PTR [rdx+160] + vmovdqu ymm4, YMMWORD PTR [r8+128] + vmovdqu ymm6, YMMWORD PTR [r8+160] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [rcx+160], ymm2 + ; 48..63 + vmovdqu ymm0, YMMWORD PTR [rdx+192] + vmovdqu ymm2, YMMWORD PTR [rdx+224] + vmovdqu ymm4, YMMWORD PTR [r8+192] + vmovdqu ymm6, YMMWORD PTR [r8+224] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+192], ymm0 + vmovdqu YMMWORD PTR [rcx+224], ymm2 + ; 64..79 + vmovdqu ymm0,
YMMWORD PTR [rdx+256] + vmovdqu ymm2, YMMWORD PTR [rdx+288] + vmovdqu ymm4, YMMWORD PTR [r8+256] + vmovdqu ymm6, YMMWORD PTR [r8+288] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+256], ymm0 + vmovdqu YMMWORD PTR [rcx+288], ymm2 + ; 80..95 + vmovdqu ymm0, YMMWORD PTR [rdx+320] + vmovdqu ymm2, YMMWORD PTR [rdx+352] + vmovdqu ymm4, YMMWORD PTR [r8+320] + vmovdqu ymm6, YMMWORD PTR [r8+352] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+320], ymm0 + vmovdqu YMMWORD PTR [rcx+352], ymm2 + ; 96..111 + vmovdqu ymm0, YMMWORD PTR [rdx+384] + vmovdqu ymm2, YMMWORD PTR [rdx+416] + vmovdqu ymm4, YMMWORD PTR [r8+384] + vmovdqu ymm6, YMMWORD PTR [r8+416] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7,
ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+384], ymm0 + vmovdqu YMMWORD PTR [rcx+416], ymm2 + ; 112..127 + vmovdqu ymm0, YMMWORD PTR [rdx+448] + vmovdqu ymm2, YMMWORD PTR [rdx+480] + vmovdqu ymm4, YMMWORD PTR [r8+448] + vmovdqu ymm6, YMMWORD PTR [r8+480] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+448], ymm0 + vmovdqu YMMWORD PTR [rcx+480], ymm2 + ; 128..143 + vmovdqu ymm0, YMMWORD PTR [rdx+512] + vmovdqu ymm2, YMMWORD PTR [rdx+544] + vmovdqu ymm4, YMMWORD PTR [r8+512] + vmovdqu ymm6, YMMWORD PTR [r8+544] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld
ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+512], ymm0 + vmovdqu YMMWORD PTR [rcx+544], ymm2 + ; 144..159 + vmovdqu ymm0, YMMWORD PTR [rdx+576] + vmovdqu ymm2, YMMWORD PTR [rdx+608] + vmovdqu ymm4, YMMWORD PTR [r8+576] + vmovdqu ymm6, YMMWORD PTR [r8+608] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+576], ymm0 + vmovdqu YMMWORD PTR [rcx+608], ymm2 + ; 160..175 + vmovdqu ymm0, YMMWORD PTR [rdx+640] + vmovdqu ymm2, YMMWORD PTR [rdx+672] + vmovdqu ymm4, YMMWORD PTR [r8+640] + vmovdqu ymm6, YMMWORD PTR [r8+672] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7
+ vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+640], ymm0 + vmovdqu YMMWORD PTR [rcx+672], ymm2 + ; 176..191 + vmovdqu ymm0, YMMWORD PTR [rdx+704] + vmovdqu ymm2, YMMWORD PTR [rdx+736] + vmovdqu ymm4, YMMWORD PTR [r8+704] + vmovdqu ymm6, YMMWORD PTR [r8+736] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+704], ymm0 + vmovdqu YMMWORD PTR [rcx+736], ymm2 + ; 192..207 + vmovdqu ymm0, YMMWORD PTR [rdx+768] + vmovdqu ymm2, YMMWORD PTR [rdx+800] + vmovdqu ymm4, YMMWORD PTR [r8+768] + vmovdqu ymm6, YMMWORD PTR [r8+800] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+768], ymm0 + vmovdqu YMMWORD PTR [rcx+800], ymm2 + ; 208..223 + vmovdqu ymm0, YMMWORD PTR [rdx+832] +
vmovdqu ymm2, YMMWORD PTR [rdx+864] + vmovdqu ymm4, YMMWORD PTR [r8+832] + vmovdqu ymm6, YMMWORD PTR [r8+864] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+832], ymm0 + vmovdqu YMMWORD PTR [rcx+864], ymm2 + ; 224..239 + vmovdqu ymm0, YMMWORD PTR [rdx+896] + vmovdqu ymm2, YMMWORD PTR [rdx+928] + vmovdqu ymm4, YMMWORD PTR [r8+896] + vmovdqu ymm6, YMMWORD PTR [r8+928] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0, ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+896], ymm0 + vmovdqu YMMWORD PTR [rcx+928], ymm2 + ; 240..255 - final step, followed by vzeroupper, the restore of xmm6-xmm9 and the stack release + vmovdqu ymm0, YMMWORD PTR [rdx+960] + vmovdqu ymm2, YMMWORD PTR [rdx+992] + vmovdqu ymm4, YMMWORD PTR [r8+960] + vmovdqu ymm6, YMMWORD PTR [r8+992] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpmuldq ymm0,
ymm0, ymm4 + vpmuldq ymm1, ymm1, ymm5 + vpmuldq ymm2, ymm2, ymm6 + vpmuldq ymm3, ymm3, ymm7 + ; Mont Reduce 2 + vpmulld ymm4, ymm0, ymm9 + vpmulld ymm5, ymm1, ymm9 + vpmulld ymm6, ymm2, ymm9 + vpmulld ymm7, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm8 + vpmuldq ymm5, ymm5, ymm8 + vpmuldq ymm6, ymm6, ymm8 + vpmuldq ymm7, ymm7, ymm8 + vpsubd ymm0, ymm0, ymm4 + vpsubd ymm1, ymm1, ymm5 + vpsubd ymm2, ymm2, ymm6 + vpsubd ymm3, ymm3, ymm7 + vpsrlq ymm0, ymm0, 32 + vpsrlq ymm2, ymm2, 32 + vpor ymm0, ymm0, ymm1 + vpor ymm2, ymm2, ymm3 + vmovdqu YMMWORD PTR [rcx+960], ymm0 + vmovdqu YMMWORD PTR [rcx+992], ymm2 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + add rsp, 64 + ret +wc_mldsa_mul_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_mul_vec_4_avx2 PROC + sub rsp, 128 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vpxor ymm12, ymm12, ymm12 + vmovdqu ymm12, YMMWORD PTR mldsa_q + vmovdqu ymm13, YMMWORD PTR mldsa_qinv + ; 0..7 + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm2, YMMWORD PTR [rdx+1024] + vmovdqu ymm6, YMMWORD PTR [r8] + vmovdqu ymm8, YMMWORD PTR [r8+1024] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2048] + vmovdqu ymm4, YMMWORD PTR [rdx+3072] + vmovdqu ymm8, YMMWORD PTR [r8+2048] + vmovdqu ymm10, YMMWORD PTR [r8+3072] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3,
ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx], ymm0 + ; 8..15 + vmovdqu ymm0, YMMWORD PTR [rdx+32] + vmovdqu ymm2, YMMWORD PTR [rdx+1056] + vmovdqu ymm6, YMMWORD PTR [r8+32] + vmovdqu ymm8, YMMWORD PTR [r8+1056] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2080] + vmovdqu ymm4, YMMWORD PTR [rdx+3104] + vmovdqu ymm8, YMMWORD PTR [r8+2080] + vmovdqu ymm10, YMMWORD PTR [r8+3104] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + ; 16..23 + vmovdqu ymm0, YMMWORD PTR [rdx+64] + vmovdqu ymm2, YMMWORD PTR [rdx+1088] + vmovdqu ymm6, YMMWORD PTR [r8+64] + vmovdqu ymm8, YMMWORD PTR [r8+1088] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + 
vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2112] + vmovdqu ymm4, YMMWORD PTR [rdx+3136] + vmovdqu ymm8, YMMWORD PTR [r8+2112] + vmovdqu ymm10, YMMWORD PTR [r8+3136] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + ; 24..31 + vmovdqu ymm0, YMMWORD PTR [rdx+96] + vmovdqu ymm2, YMMWORD PTR [rdx+1120] + vmovdqu ymm6, YMMWORD PTR [r8+96] + vmovdqu ymm8, YMMWORD PTR [r8+1120] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2144] + vmovdqu ymm4, YMMWORD PTR [rdx+3168] + vmovdqu ymm8, YMMWORD PTR [r8+2144] + vmovdqu ymm10, YMMWORD PTR [r8+3168] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + ; 32..39 + vmovdqu ymm0, YMMWORD PTR [rdx+128] + vmovdqu ymm2, 
YMMWORD PTR [rdx+1152] + vmovdqu ymm6, YMMWORD PTR [r8+128] + vmovdqu ymm8, YMMWORD PTR [r8+1152] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2176] + vmovdqu ymm4, YMMWORD PTR [rdx+3200] + vmovdqu ymm8, YMMWORD PTR [r8+2176] + vmovdqu ymm10, YMMWORD PTR [r8+3200] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + ; 40..47 + vmovdqu ymm0, YMMWORD PTR [rdx+160] + vmovdqu ymm2, YMMWORD PTR [rdx+1184] + vmovdqu ymm6, YMMWORD PTR [r8+160] + vmovdqu ymm8, YMMWORD PTR [r8+1184] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2208] + vmovdqu ymm4, YMMWORD PTR [rdx+3232] + vmovdqu ymm8, YMMWORD PTR [r8+2208] + vmovdqu ymm10, YMMWORD PTR [r8+3232] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, 
ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+160], ymm0 + ; 48..55 + vmovdqu ymm0, YMMWORD PTR [rdx+192] + vmovdqu ymm2, YMMWORD PTR [rdx+1216] + vmovdqu ymm6, YMMWORD PTR [r8+192] + vmovdqu ymm8, YMMWORD PTR [r8+1216] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2240] + vmovdqu ymm4, YMMWORD PTR [rdx+3264] + vmovdqu ymm8, YMMWORD PTR [r8+2240] + vmovdqu ymm10, YMMWORD PTR [r8+3264] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+192], ymm0 + ; 56..63 + vmovdqu ymm0, YMMWORD PTR [rdx+224] + vmovdqu ymm2, YMMWORD PTR [rdx+1248] + vmovdqu ymm6, YMMWORD PTR [r8+224] + vmovdqu ymm8, YMMWORD PTR [r8+1248] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2272] + vmovdqu ymm4, YMMWORD PTR [rdx+3296] + vmovdqu ymm8, YMMWORD PTR [r8+2272] + 
vmovdqu ymm10, YMMWORD PTR [r8+3296] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+224], ymm0 + ; 64..71 + vmovdqu ymm0, YMMWORD PTR [rdx+256] + vmovdqu ymm2, YMMWORD PTR [rdx+1280] + vmovdqu ymm6, YMMWORD PTR [r8+256] + vmovdqu ymm8, YMMWORD PTR [r8+1280] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2304] + vmovdqu ymm4, YMMWORD PTR [rdx+3328] + vmovdqu ymm8, YMMWORD PTR [r8+2304] + vmovdqu ymm10, YMMWORD PTR [r8+3328] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+256], ymm0 + ; 72..79 + vmovdqu ymm0, YMMWORD PTR [rdx+288] + vmovdqu ymm2, YMMWORD PTR [rdx+1312] + vmovdqu ymm6, YMMWORD PTR [r8+288] + vmovdqu ymm8, YMMWORD PTR [r8+1312] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, 
ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2336] + vmovdqu ymm4, YMMWORD PTR [rdx+3360] + vmovdqu ymm8, YMMWORD PTR [r8+2336] + vmovdqu ymm10, YMMWORD PTR [r8+3360] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+288], ymm0 + ; 80..87 + vmovdqu ymm0, YMMWORD PTR [rdx+320] + vmovdqu ymm2, YMMWORD PTR [rdx+1344] + vmovdqu ymm6, YMMWORD PTR [r8+320] + vmovdqu ymm8, YMMWORD PTR [r8+1344] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2368] + vmovdqu ymm4, YMMWORD PTR [rdx+3392] + vmovdqu ymm8, YMMWORD PTR [r8+2368] + vmovdqu ymm10, YMMWORD PTR [r8+3392] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + 
vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+320], ymm0 + ; 88..95 + vmovdqu ymm0, YMMWORD PTR [rdx+352] + vmovdqu ymm2, YMMWORD PTR [rdx+1376] + vmovdqu ymm6, YMMWORD PTR [r8+352] + vmovdqu ymm8, YMMWORD PTR [r8+1376] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2400] + vmovdqu ymm4, YMMWORD PTR [rdx+3424] + vmovdqu ymm8, YMMWORD PTR [r8+2400] + vmovdqu ymm10, YMMWORD PTR [r8+3424] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+352], ymm0 + ; 96..103 + vmovdqu ymm0, YMMWORD PTR [rdx+384] + vmovdqu ymm2, YMMWORD PTR [rdx+1408] + vmovdqu ymm6, YMMWORD PTR [r8+384] + vmovdqu ymm8, YMMWORD PTR [r8+1408] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2432] + vmovdqu ymm4, YMMWORD PTR [rdx+3456] + vmovdqu ymm8, YMMWORD PTR [r8+2432] + vmovdqu ymm10, YMMWORD PTR [r8+3456] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + 
vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+384], ymm0 + ; 104..111 + vmovdqu ymm0, YMMWORD PTR [rdx+416] + vmovdqu ymm2, YMMWORD PTR [rdx+1440] + vmovdqu ymm6, YMMWORD PTR [r8+416] + vmovdqu ymm8, YMMWORD PTR [r8+1440] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2464] + vmovdqu ymm4, YMMWORD PTR [rdx+3488] + vmovdqu ymm8, YMMWORD PTR [r8+2464] + vmovdqu ymm10, YMMWORD PTR [r8+3488] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+416], ymm0 + ; 112..119 + vmovdqu ymm0, YMMWORD PTR [rdx+448] + vmovdqu ymm2, YMMWORD PTR [rdx+1472] + vmovdqu ymm6, YMMWORD PTR [r8+448] + vmovdqu ymm8, YMMWORD PTR [r8+1472] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, 
ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2496] + vmovdqu ymm4, YMMWORD PTR [rdx+3520] + vmovdqu ymm8, YMMWORD PTR [r8+2496] + vmovdqu ymm10, YMMWORD PTR [r8+3520] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+448], ymm0 + ; 120..127 + vmovdqu ymm0, YMMWORD PTR [rdx+480] + vmovdqu ymm2, YMMWORD PTR [rdx+1504] + vmovdqu ymm6, YMMWORD PTR [r8+480] + vmovdqu ymm8, YMMWORD PTR [r8+1504] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2528] + vmovdqu ymm4, YMMWORD PTR [rdx+3552] + vmovdqu ymm8, YMMWORD PTR [r8+2528] + vmovdqu ymm10, YMMWORD PTR [r8+3552] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+480], ymm0 
+ ; 128..135 + vmovdqu ymm0, YMMWORD PTR [rdx+512] + vmovdqu ymm2, YMMWORD PTR [rdx+1536] + vmovdqu ymm6, YMMWORD PTR [r8+512] + vmovdqu ymm8, YMMWORD PTR [r8+1536] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2560] + vmovdqu ymm4, YMMWORD PTR [rdx+3584] + vmovdqu ymm8, YMMWORD PTR [r8+2560] + vmovdqu ymm10, YMMWORD PTR [r8+3584] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+512], ymm0 + ; 136..143 + vmovdqu ymm0, YMMWORD PTR [rdx+544] + vmovdqu ymm2, YMMWORD PTR [rdx+1568] + vmovdqu ymm6, YMMWORD PTR [r8+544] + vmovdqu ymm8, YMMWORD PTR [r8+1568] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2592] + vmovdqu ymm4, YMMWORD PTR [rdx+3616] + vmovdqu ymm8, YMMWORD PTR [r8+2592] + vmovdqu ymm10, YMMWORD PTR [r8+3616] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + 
vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+544], ymm0 + ; 144..151 + vmovdqu ymm0, YMMWORD PTR [rdx+576] + vmovdqu ymm2, YMMWORD PTR [rdx+1600] + vmovdqu ymm6, YMMWORD PTR [r8+576] + vmovdqu ymm8, YMMWORD PTR [r8+1600] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2624] + vmovdqu ymm4, YMMWORD PTR [rdx+3648] + vmovdqu ymm8, YMMWORD PTR [r8+2624] + vmovdqu ymm10, YMMWORD PTR [r8+3648] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+576], ymm0 + ; 152..159 + vmovdqu ymm0, YMMWORD PTR [rdx+608] + vmovdqu ymm2, YMMWORD PTR [rdx+1632] + vmovdqu ymm6, YMMWORD PTR [r8+608] + vmovdqu ymm8, YMMWORD PTR [r8+1632] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2656] + vmovdqu ymm4, 
YMMWORD PTR [rdx+3680] + vmovdqu ymm8, YMMWORD PTR [r8+2656] + vmovdqu ymm10, YMMWORD PTR [r8+3680] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+608], ymm0 + ; 160..167 + vmovdqu ymm0, YMMWORD PTR [rdx+640] + vmovdqu ymm2, YMMWORD PTR [rdx+1664] + vmovdqu ymm6, YMMWORD PTR [r8+640] + vmovdqu ymm8, YMMWORD PTR [r8+1664] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2688] + vmovdqu ymm4, YMMWORD PTR [rdx+3712] + vmovdqu ymm8, YMMWORD PTR [r8+2688] + vmovdqu ymm10, YMMWORD PTR [r8+3712] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+640], ymm0 + ; 168..175 + vmovdqu ymm0, YMMWORD PTR [rdx+672] + vmovdqu ymm2, YMMWORD PTR [rdx+1696] + vmovdqu ymm6, YMMWORD PTR [r8+672] + vmovdqu 
ymm8, YMMWORD PTR [r8+1696] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2720] + vmovdqu ymm4, YMMWORD PTR [rdx+3744] + vmovdqu ymm8, YMMWORD PTR [r8+2720] + vmovdqu ymm10, YMMWORD PTR [r8+3744] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+672], ymm0 + ; 176..183 + vmovdqu ymm0, YMMWORD PTR [rdx+704] + vmovdqu ymm2, YMMWORD PTR [rdx+1728] + vmovdqu ymm6, YMMWORD PTR [r8+704] + vmovdqu ymm8, YMMWORD PTR [r8+1728] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2752] + vmovdqu ymm4, YMMWORD PTR [rdx+3776] + vmovdqu ymm8, YMMWORD PTR [r8+2752] + vmovdqu ymm10, YMMWORD PTR [r8+3776] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld 
ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+704], ymm0 + ; 184..191 + vmovdqu ymm0, YMMWORD PTR [rdx+736] + vmovdqu ymm2, YMMWORD PTR [rdx+1760] + vmovdqu ymm6, YMMWORD PTR [r8+736] + vmovdqu ymm8, YMMWORD PTR [r8+1760] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2784] + vmovdqu ymm4, YMMWORD PTR [rdx+3808] + vmovdqu ymm8, YMMWORD PTR [r8+2784] + vmovdqu ymm10, YMMWORD PTR [r8+3808] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+736], ymm0 + ; 192..199 + vmovdqu ymm0, YMMWORD PTR [rdx+768] + vmovdqu ymm2, YMMWORD PTR [rdx+1792] + vmovdqu ymm6, YMMWORD PTR [r8+768] + vmovdqu ymm8, YMMWORD PTR [r8+1792] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2816] + vmovdqu ymm4, YMMWORD PTR [rdx+3840] + vmovdqu ymm8, YMMWORD PTR [r8+2816] + vmovdqu ymm10, YMMWORD PTR [r8+3840] + vpshufd ymm3, ymm2, 245 + 
vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+768], ymm0 + ; 200..207 + vmovdqu ymm0, YMMWORD PTR [rdx+800] + vmovdqu ymm2, YMMWORD PTR [rdx+1824] + vmovdqu ymm6, YMMWORD PTR [r8+800] + vmovdqu ymm8, YMMWORD PTR [r8+1824] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2848] + vmovdqu ymm4, YMMWORD PTR [rdx+3872] + vmovdqu ymm8, YMMWORD PTR [r8+2848] + vmovdqu ymm10, YMMWORD PTR [r8+3872] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+800], ymm0 + ; 208..215 + vmovdqu ymm0, YMMWORD PTR [rdx+832] + vmovdqu ymm2, YMMWORD PTR [rdx+1856] + vmovdqu ymm6, YMMWORD PTR [r8+832] + vmovdqu ymm8, YMMWORD PTR [r8+1856] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 
245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2880] + vmovdqu ymm4, YMMWORD PTR [rdx+3904] + vmovdqu ymm8, YMMWORD PTR [r8+2880] + vmovdqu ymm10, YMMWORD PTR [r8+3904] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+832], ymm0 + ; 216..223 + vmovdqu ymm0, YMMWORD PTR [rdx+864] + vmovdqu ymm2, YMMWORD PTR [rdx+1888] + vmovdqu ymm6, YMMWORD PTR [r8+864] + vmovdqu ymm8, YMMWORD PTR [r8+1888] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2912] + vmovdqu ymm4, YMMWORD PTR [rdx+3936] + vmovdqu ymm8, YMMWORD PTR [r8+2912] + vmovdqu ymm10, YMMWORD PTR [r8+3936] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + 
vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+864], ymm0 + ; 224..231 + vmovdqu ymm0, YMMWORD PTR [rdx+896] + vmovdqu ymm2, YMMWORD PTR [rdx+1920] + vmovdqu ymm6, YMMWORD PTR [r8+896] + vmovdqu ymm8, YMMWORD PTR [r8+1920] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2944] + vmovdqu ymm4, YMMWORD PTR [rdx+3968] + vmovdqu ymm8, YMMWORD PTR [r8+2944] + vmovdqu ymm10, YMMWORD PTR [r8+3968] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+896], ymm0 + ; 232..239 + vmovdqu ymm0, YMMWORD PTR [rdx+928] + vmovdqu ymm2, YMMWORD PTR [rdx+1952] + vmovdqu ymm6, YMMWORD PTR [r8+928] + vmovdqu ymm8, YMMWORD PTR [r8+1952] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+2976] + vmovdqu ymm4, YMMWORD PTR [rdx+4000] + vmovdqu ymm8, YMMWORD PTR [r8+2976] + vmovdqu ymm10, YMMWORD PTR [r8+4000] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 
+ vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+928], ymm0 + ; 240..247 + vmovdqu ymm0, YMMWORD PTR [rdx+960] + vmovdqu ymm2, YMMWORD PTR [rdx+1984] + vmovdqu ymm6, YMMWORD PTR [r8+960] + vmovdqu ymm8, YMMWORD PTR [r8+1984] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+3008] + vmovdqu ymm4, YMMWORD PTR [rdx+4032] + vmovdqu ymm8, YMMWORD PTR [r8+3008] + vmovdqu ymm10, YMMWORD PTR [r8+4032] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+960], ymm0 + ; 248..255 + vmovdqu ymm0, YMMWORD PTR [rdx+992] + vmovdqu ymm2, YMMWORD PTR [rdx+2016] + vmovdqu ymm6, YMMWORD PTR [r8+992] + vmovdqu ymm8, YMMWORD PTR [r8+2016] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpaddq ymm0, ymm0, 
ymm2 + vpaddq ymm1, ymm1, ymm3 + vmovdqu ymm2, YMMWORD PTR [rdx+3040] + vmovdqu ymm4, YMMWORD PTR [rdx+4064] + vmovdqu ymm8, YMMWORD PTR [r8+3040] + vmovdqu ymm10, YMMWORD PTR [r8+4064] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+992], ymm0 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + add rsp, 128 + ret +wc_mldsa_mul_vec_4_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_mul_vec_5_avx2 PROC + sub rsp, 128 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vpxor ymm12, ymm12, ymm12 + vmovdqu ymm12, YMMWORD PTR mldsa_q + vmovdqu ymm13, YMMWORD PTR mldsa_qinv + ; 0..7 + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm2, YMMWORD PTR [rdx+1024] + vmovdqu ymm4, YMMWORD PTR [rdx+2048] + vmovdqu ymm6, YMMWORD PTR [r8] + vmovdqu ymm8, YMMWORD PTR [r8+1024] + vmovdqu ymm10, YMMWORD PTR [r8+2048] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, 
ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3072] + vmovdqu ymm4, YMMWORD PTR [rdx+4096] + vmovdqu ymm8, YMMWORD PTR [r8+3072] + vmovdqu ymm10, YMMWORD PTR [r8+4096] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx], ymm0 + ; 8..15 + vmovdqu ymm0, YMMWORD PTR [rdx+32] + vmovdqu ymm2, YMMWORD PTR [rdx+1056] + vmovdqu ymm4, YMMWORD PTR [rdx+2080] + vmovdqu ymm6, YMMWORD PTR [r8+32] + vmovdqu ymm8, YMMWORD PTR [r8+1056] + vmovdqu ymm10, YMMWORD PTR [r8+2080] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3104] + vmovdqu ymm4, YMMWORD PTR [rdx+4128] + vmovdqu ymm8, YMMWORD PTR [r8+3104] + vmovdqu ymm10, YMMWORD PTR [r8+4128] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + 
vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + ; 16..23 + vmovdqu ymm0, YMMWORD PTR [rdx+64] + vmovdqu ymm2, YMMWORD PTR [rdx+1088] + vmovdqu ymm4, YMMWORD PTR [rdx+2112] + vmovdqu ymm6, YMMWORD PTR [r8+64] + vmovdqu ymm8, YMMWORD PTR [r8+1088] + vmovdqu ymm10, YMMWORD PTR [r8+2112] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3136] + vmovdqu ymm4, YMMWORD PTR [rdx+4160] + vmovdqu ymm8, YMMWORD PTR [r8+3136] + vmovdqu ymm10, YMMWORD PTR [r8+4160] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + ; 24..31 + vmovdqu ymm0, YMMWORD PTR [rdx+96] + vmovdqu ymm2, YMMWORD PTR [rdx+1120] + vmovdqu ymm4, YMMWORD PTR [rdx+2144] + vmovdqu ymm6, YMMWORD PTR [r8+96] + vmovdqu ymm8, YMMWORD PTR 
[r8+1120] + vmovdqu ymm10, YMMWORD PTR [r8+2144] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3168] + vmovdqu ymm4, YMMWORD PTR [rdx+4192] + vmovdqu ymm8, YMMWORD PTR [r8+3168] + vmovdqu ymm10, YMMWORD PTR [r8+4192] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + ; 32..39 + vmovdqu ymm0, YMMWORD PTR [rdx+128] + vmovdqu ymm2, YMMWORD PTR [rdx+1152] + vmovdqu ymm4, YMMWORD PTR [rdx+2176] + vmovdqu ymm6, YMMWORD PTR [r8+128] + vmovdqu ymm8, YMMWORD PTR [r8+1152] + vmovdqu ymm10, YMMWORD PTR [r8+2176] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3200] + vmovdqu ymm4, YMMWORD PTR [rdx+4224] + vmovdqu ymm8, YMMWORD PTR [r8+3200] 
+ vmovdqu ymm10, YMMWORD PTR [r8+4224] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + ; 40..47 + vmovdqu ymm0, YMMWORD PTR [rdx+160] + vmovdqu ymm2, YMMWORD PTR [rdx+1184] + vmovdqu ymm4, YMMWORD PTR [rdx+2208] + vmovdqu ymm6, YMMWORD PTR [r8+160] + vmovdqu ymm8, YMMWORD PTR [r8+1184] + vmovdqu ymm10, YMMWORD PTR [r8+2208] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3232] + vmovdqu ymm4, YMMWORD PTR [rdx+4256] + vmovdqu ymm8, YMMWORD PTR [r8+3232] + vmovdqu ymm10, YMMWORD PTR [r8+4256] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 
+ vmovdqu YMMWORD PTR [rcx+160], ymm0 + ; 48..55 + vmovdqu ymm0, YMMWORD PTR [rdx+192] + vmovdqu ymm2, YMMWORD PTR [rdx+1216] + vmovdqu ymm4, YMMWORD PTR [rdx+2240] + vmovdqu ymm6, YMMWORD PTR [r8+192] + vmovdqu ymm8, YMMWORD PTR [r8+1216] + vmovdqu ymm10, YMMWORD PTR [r8+2240] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3264] + vmovdqu ymm4, YMMWORD PTR [rdx+4288] + vmovdqu ymm8, YMMWORD PTR [r8+3264] + vmovdqu ymm10, YMMWORD PTR [r8+4288] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+192], ymm0 + ; 56..63 + vmovdqu ymm0, YMMWORD PTR [rdx+224] + vmovdqu ymm2, YMMWORD PTR [rdx+1248] + vmovdqu ymm4, YMMWORD PTR [rdx+2272] + vmovdqu ymm6, YMMWORD PTR [r8+224] + vmovdqu ymm8, YMMWORD PTR [r8+1248] + vmovdqu ymm10, YMMWORD PTR [r8+2272] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, 
ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3296] + vmovdqu ymm4, YMMWORD PTR [rdx+4320] + vmovdqu ymm8, YMMWORD PTR [r8+3296] + vmovdqu ymm10, YMMWORD PTR [r8+4320] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+224], ymm0 + ; 64..71 + vmovdqu ymm0, YMMWORD PTR [rdx+256] + vmovdqu ymm2, YMMWORD PTR [rdx+1280] + vmovdqu ymm4, YMMWORD PTR [rdx+2304] + vmovdqu ymm6, YMMWORD PTR [r8+256] + vmovdqu ymm8, YMMWORD PTR [r8+1280] + vmovdqu ymm10, YMMWORD PTR [r8+2304] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3328] + vmovdqu ymm4, YMMWORD PTR [rdx+4352] + vmovdqu ymm8, YMMWORD PTR [r8+3328] + vmovdqu ymm10, YMMWORD PTR [r8+4352] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, 
ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+256], ymm0 + ; 72..79 + vmovdqu ymm0, YMMWORD PTR [rdx+288] + vmovdqu ymm2, YMMWORD PTR [rdx+1312] + vmovdqu ymm4, YMMWORD PTR [rdx+2336] + vmovdqu ymm6, YMMWORD PTR [r8+288] + vmovdqu ymm8, YMMWORD PTR [r8+1312] + vmovdqu ymm10, YMMWORD PTR [r8+2336] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3360] + vmovdqu ymm4, YMMWORD PTR [rdx+4384] + vmovdqu ymm8, YMMWORD PTR [r8+3360] + vmovdqu ymm10, YMMWORD PTR [r8+4384] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+288], ymm0 + ; 80..87 + vmovdqu ymm0, YMMWORD PTR [rdx+320] + vmovdqu ymm2, YMMWORD PTR [rdx+1344] + vmovdqu ymm4, YMMWORD PTR [rdx+2368] + vmovdqu ymm6, YMMWORD PTR [r8+320] + vmovdqu ymm8, YMMWORD PTR [r8+1344] + vmovdqu ymm10, YMMWORD PTR [r8+2368] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 
245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3392] + vmovdqu ymm4, YMMWORD PTR [rdx+4416] + vmovdqu ymm8, YMMWORD PTR [r8+3392] + vmovdqu ymm10, YMMWORD PTR [r8+4416] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+320], ymm0 + ; 88..95 + vmovdqu ymm0, YMMWORD PTR [rdx+352] + vmovdqu ymm2, YMMWORD PTR [rdx+1376] + vmovdqu ymm4, YMMWORD PTR [rdx+2400] + vmovdqu ymm6, YMMWORD PTR [r8+352] + vmovdqu ymm8, YMMWORD PTR [r8+1376] + vmovdqu ymm10, YMMWORD PTR [r8+2400] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3424] + vmovdqu ymm4, YMMWORD PTR [rdx+4448] + vmovdqu ymm8, YMMWORD PTR [r8+3424] + vmovdqu ymm10, YMMWORD PTR [r8+4448] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + 
vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+352], ymm0 + ; 96..103 + vmovdqu ymm0, YMMWORD PTR [rdx+384] + vmovdqu ymm2, YMMWORD PTR [rdx+1408] + vmovdqu ymm4, YMMWORD PTR [rdx+2432] + vmovdqu ymm6, YMMWORD PTR [r8+384] + vmovdqu ymm8, YMMWORD PTR [r8+1408] + vmovdqu ymm10, YMMWORD PTR [r8+2432] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3456] + vmovdqu ymm4, YMMWORD PTR [rdx+4480] + vmovdqu ymm8, YMMWORD PTR [r8+3456] + vmovdqu ymm10, YMMWORD PTR [r8+4480] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+384], ymm0 + ; 104..111 + vmovdqu ymm0, YMMWORD PTR [rdx+416] + vmovdqu ymm2, YMMWORD PTR 
[rdx+1440] + vmovdqu ymm4, YMMWORD PTR [rdx+2464] + vmovdqu ymm6, YMMWORD PTR [r8+416] + vmovdqu ymm8, YMMWORD PTR [r8+1440] + vmovdqu ymm10, YMMWORD PTR [r8+2464] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3488] + vmovdqu ymm4, YMMWORD PTR [rdx+4512] + vmovdqu ymm8, YMMWORD PTR [r8+3488] + vmovdqu ymm10, YMMWORD PTR [r8+4512] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+416], ymm0 + ; 112..119 + vmovdqu ymm0, YMMWORD PTR [rdx+448] + vmovdqu ymm2, YMMWORD PTR [rdx+1472] + vmovdqu ymm4, YMMWORD PTR [rdx+2496] + vmovdqu ymm6, YMMWORD PTR [r8+448] + vmovdqu ymm8, YMMWORD PTR [r8+1472] + vmovdqu ymm10, YMMWORD PTR [r8+2496] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, 
ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3520] + vmovdqu ymm4, YMMWORD PTR [rdx+4544] + vmovdqu ymm8, YMMWORD PTR [r8+3520] + vmovdqu ymm10, YMMWORD PTR [r8+4544] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+448], ymm0 + ; 120..127 + vmovdqu ymm0, YMMWORD PTR [rdx+480] + vmovdqu ymm2, YMMWORD PTR [rdx+1504] + vmovdqu ymm4, YMMWORD PTR [rdx+2528] + vmovdqu ymm6, YMMWORD PTR [r8+480] + vmovdqu ymm8, YMMWORD PTR [r8+1504] + vmovdqu ymm10, YMMWORD PTR [r8+2528] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3552] + vmovdqu ymm4, YMMWORD PTR [rdx+4576] + vmovdqu ymm8, YMMWORD PTR [r8+3552] + vmovdqu ymm10, YMMWORD PTR [r8+4576] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + 
vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+480], ymm0 + ; 128..135 + vmovdqu ymm0, YMMWORD PTR [rdx+512] + vmovdqu ymm2, YMMWORD PTR [rdx+1536] + vmovdqu ymm4, YMMWORD PTR [rdx+2560] + vmovdqu ymm6, YMMWORD PTR [r8+512] + vmovdqu ymm8, YMMWORD PTR [r8+1536] + vmovdqu ymm10, YMMWORD PTR [r8+2560] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3584] + vmovdqu ymm4, YMMWORD PTR [rdx+4608] + vmovdqu ymm8, YMMWORD PTR [r8+3584] + vmovdqu ymm10, YMMWORD PTR [r8+4608] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+512], ymm0 + ; 136..143 + vmovdqu ymm0, YMMWORD PTR [rdx+544] + vmovdqu ymm2, YMMWORD PTR [rdx+1568] + vmovdqu ymm4, YMMWORD PTR [rdx+2592] + vmovdqu ymm6, YMMWORD PTR [r8+544] + vmovdqu ymm8, YMMWORD PTR [r8+1568] + vmovdqu ymm10, YMMWORD PTR [r8+2592] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, 
ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3616] + vmovdqu ymm4, YMMWORD PTR [rdx+4640] + vmovdqu ymm8, YMMWORD PTR [r8+3616] + vmovdqu ymm10, YMMWORD PTR [r8+4640] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+544], ymm0 + ; 144..151 + vmovdqu ymm0, YMMWORD PTR [rdx+576] + vmovdqu ymm2, YMMWORD PTR [rdx+1600] + vmovdqu ymm4, YMMWORD PTR [rdx+2624] + vmovdqu ymm6, YMMWORD PTR [r8+576] + vmovdqu ymm8, YMMWORD PTR [r8+1600] + vmovdqu ymm10, YMMWORD PTR [r8+2624] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3648] + vmovdqu ymm4, YMMWORD PTR [rdx+4672] + vmovdqu ymm8, YMMWORD PTR [r8+3648] + vmovdqu ymm10, YMMWORD PTR [r8+4672] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, 
ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+576], ymm0 + ; 152..159 + vmovdqu ymm0, YMMWORD PTR [rdx+608] + vmovdqu ymm2, YMMWORD PTR [rdx+1632] + vmovdqu ymm4, YMMWORD PTR [rdx+2656] + vmovdqu ymm6, YMMWORD PTR [r8+608] + vmovdqu ymm8, YMMWORD PTR [r8+1632] + vmovdqu ymm10, YMMWORD PTR [r8+2656] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3680] + vmovdqu ymm4, YMMWORD PTR [rdx+4704] + vmovdqu ymm8, YMMWORD PTR [r8+3680] + vmovdqu ymm10, YMMWORD PTR [r8+4704] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+608], ymm0 + ; 160..167 + vmovdqu ymm0, YMMWORD PTR [rdx+640] + vmovdqu ymm2, YMMWORD PTR [rdx+1664] + vmovdqu ymm4, YMMWORD PTR [rdx+2688] + vmovdqu ymm6, YMMWORD PTR [r8+640] + vmovdqu ymm8, 
YMMWORD PTR [r8+1664] + vmovdqu ymm10, YMMWORD PTR [r8+2688] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3712] + vmovdqu ymm4, YMMWORD PTR [rdx+4736] + vmovdqu ymm8, YMMWORD PTR [r8+3712] + vmovdqu ymm10, YMMWORD PTR [r8+4736] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+640], ymm0 + ; 168..175 + vmovdqu ymm0, YMMWORD PTR [rdx+672] + vmovdqu ymm2, YMMWORD PTR [rdx+1696] + vmovdqu ymm4, YMMWORD PTR [rdx+2720] + vmovdqu ymm6, YMMWORD PTR [r8+672] + vmovdqu ymm8, YMMWORD PTR [r8+1696] + vmovdqu ymm10, YMMWORD PTR [r8+2720] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3744] + vmovdqu ymm4, YMMWORD PTR [rdx+4768] + vmovdqu ymm8, 
YMMWORD PTR [r8+3744] + vmovdqu ymm10, YMMWORD PTR [r8+4768] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+672], ymm0 + ; 176..183 + vmovdqu ymm0, YMMWORD PTR [rdx+704] + vmovdqu ymm2, YMMWORD PTR [rdx+1728] + vmovdqu ymm4, YMMWORD PTR [rdx+2752] + vmovdqu ymm6, YMMWORD PTR [r8+704] + vmovdqu ymm8, YMMWORD PTR [r8+1728] + vmovdqu ymm10, YMMWORD PTR [r8+2752] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3776] + vmovdqu ymm4, YMMWORD PTR [rdx+4800] + vmovdqu ymm8, YMMWORD PTR [r8+3776] + vmovdqu ymm10, YMMWORD PTR [r8+4800] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 
+ vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+704], ymm0 + ; 184..191 + vmovdqu ymm0, YMMWORD PTR [rdx+736] + vmovdqu ymm2, YMMWORD PTR [rdx+1760] + vmovdqu ymm4, YMMWORD PTR [rdx+2784] + vmovdqu ymm6, YMMWORD PTR [r8+736] + vmovdqu ymm8, YMMWORD PTR [r8+1760] + vmovdqu ymm10, YMMWORD PTR [r8+2784] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3808] + vmovdqu ymm4, YMMWORD PTR [rdx+4832] + vmovdqu ymm8, YMMWORD PTR [r8+3808] + vmovdqu ymm10, YMMWORD PTR [r8+4832] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+736], ymm0 + ; 192..199 + vmovdqu ymm0, YMMWORD PTR [rdx+768] + vmovdqu ymm2, YMMWORD PTR [rdx+1792] + vmovdqu ymm4, YMMWORD PTR [rdx+2816] + vmovdqu ymm6, YMMWORD PTR [r8+768] + vmovdqu ymm8, YMMWORD PTR [r8+1792] + vmovdqu ymm10, YMMWORD PTR [r8+2816] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, 
ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3840] + vmovdqu ymm4, YMMWORD PTR [rdx+4864] + vmovdqu ymm8, YMMWORD PTR [r8+3840] + vmovdqu ymm10, YMMWORD PTR [r8+4864] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+768], ymm0 + ; 200..207 + vmovdqu ymm0, YMMWORD PTR [rdx+800] + vmovdqu ymm2, YMMWORD PTR [rdx+1824] + vmovdqu ymm4, YMMWORD PTR [rdx+2848] + vmovdqu ymm6, YMMWORD PTR [r8+800] + vmovdqu ymm8, YMMWORD PTR [r8+1824] + vmovdqu ymm10, YMMWORD PTR [r8+2848] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3872] + vmovdqu ymm4, YMMWORD PTR [rdx+4896] + vmovdqu ymm8, YMMWORD PTR [r8+3872] + vmovdqu ymm10, YMMWORD PTR [r8+4896] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, 
ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+800], ymm0 + ; 208..215 + vmovdqu ymm0, YMMWORD PTR [rdx+832] + vmovdqu ymm2, YMMWORD PTR [rdx+1856] + vmovdqu ymm4, YMMWORD PTR [rdx+2880] + vmovdqu ymm6, YMMWORD PTR [r8+832] + vmovdqu ymm8, YMMWORD PTR [r8+1856] + vmovdqu ymm10, YMMWORD PTR [r8+2880] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3904] + vmovdqu ymm4, YMMWORD PTR [rdx+4928] + vmovdqu ymm8, YMMWORD PTR [r8+3904] + vmovdqu ymm10, YMMWORD PTR [r8+4928] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+832], ymm0 + ; 216..223 + vmovdqu ymm0, YMMWORD PTR [rdx+864] + vmovdqu ymm2, YMMWORD PTR [rdx+1888] + vmovdqu ymm4, YMMWORD PTR [rdx+2912] + vmovdqu ymm6, YMMWORD PTR [r8+864] + vmovdqu ymm8, YMMWORD PTR [r8+1888] + vmovdqu ymm10, YMMWORD PTR [r8+2912] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, 
ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3936] + vmovdqu ymm4, YMMWORD PTR [rdx+4960] + vmovdqu ymm8, YMMWORD PTR [r8+3936] + vmovdqu ymm10, YMMWORD PTR [r8+4960] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+864], ymm0 + ; 224..231 + vmovdqu ymm0, YMMWORD PTR [rdx+896] + vmovdqu ymm2, YMMWORD PTR [rdx+1920] + vmovdqu ymm4, YMMWORD PTR [rdx+2944] + vmovdqu ymm6, YMMWORD PTR [r8+896] + vmovdqu ymm8, YMMWORD PTR [r8+1920] + vmovdqu ymm10, YMMWORD PTR [r8+2944] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3968] + vmovdqu ymm4, YMMWORD PTR [rdx+4992] + vmovdqu ymm8, YMMWORD PTR [r8+3968] + vmovdqu ymm10, YMMWORD PTR [r8+4992] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 
245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+896], ymm0 + ; 232..239 + vmovdqu ymm0, YMMWORD PTR [rdx+928] + vmovdqu ymm2, YMMWORD PTR [rdx+1952] + vmovdqu ymm4, YMMWORD PTR [rdx+2976] + vmovdqu ymm6, YMMWORD PTR [r8+928] + vmovdqu ymm8, YMMWORD PTR [r8+1952] + vmovdqu ymm10, YMMWORD PTR [r8+2976] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+4000] + vmovdqu ymm4, YMMWORD PTR [rdx+5024] + vmovdqu ymm8, YMMWORD PTR [r8+4000] + vmovdqu ymm10, YMMWORD PTR [r8+5024] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+928], ymm0 + ; 240..247 + vmovdqu ymm0, YMMWORD PTR 
[rdx+960] + vmovdqu ymm2, YMMWORD PTR [rdx+1984] + vmovdqu ymm4, YMMWORD PTR [rdx+3008] + vmovdqu ymm6, YMMWORD PTR [r8+960] + vmovdqu ymm8, YMMWORD PTR [r8+1984] + vmovdqu ymm10, YMMWORD PTR [r8+3008] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+4032] + vmovdqu ymm4, YMMWORD PTR [rdx+5056] + vmovdqu ymm8, YMMWORD PTR [r8+4032] + vmovdqu ymm10, YMMWORD PTR [r8+5056] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+960], ymm0 + ; 248..255 + vmovdqu ymm0, YMMWORD PTR [rdx+992] + vmovdqu ymm2, YMMWORD PTR [rdx+2016] + vmovdqu ymm4, YMMWORD PTR [rdx+3040] + vmovdqu ymm6, YMMWORD PTR [r8+992] + vmovdqu ymm8, YMMWORD PTR [r8+2016] + vmovdqu ymm10, YMMWORD PTR [r8+3040] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq 
ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+4064] + vmovdqu ymm4, YMMWORD PTR [rdx+5088] + vmovdqu ymm8, YMMWORD PTR [r8+4064] + vmovdqu ymm10, YMMWORD PTR [r8+5088] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+992], ymm0 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + add rsp, 128 + ret +wc_mldsa_mul_vec_5_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_mul_vec_7_avx2 PROC + sub rsp, 128 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vpxor ymm12, ymm12, ymm12 + vmovdqu ymm12, YMMWORD PTR mldsa_q + vmovdqu ymm13, YMMWORD PTR mldsa_qinv + ; 0..7 + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm2, YMMWORD PTR [rdx+1024] + vmovdqu ymm4, YMMWORD PTR [rdx+2048] + vmovdqu ymm6, YMMWORD PTR [r8] + vmovdqu ymm8, YMMWORD PTR [r8+1024] + vmovdqu ymm10, YMMWORD PTR [r8+2048] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + 
vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3072] + vmovdqu ymm4, YMMWORD PTR [rdx+4096] + vmovdqu ymm8, YMMWORD PTR [r8+3072] + vmovdqu ymm10, YMMWORD PTR [r8+4096] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5120] + vmovdqu ymm4, YMMWORD PTR [rdx+6144] + vmovdqu ymm8, YMMWORD PTR [r8+5120] + vmovdqu ymm10, YMMWORD PTR [r8+6144] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx], ymm0 + ; 8..15 + vmovdqu ymm0, YMMWORD PTR [rdx+32] + vmovdqu ymm2, YMMWORD PTR [rdx+1056] + vmovdqu ymm4, YMMWORD PTR [rdx+2080] + vmovdqu ymm6, YMMWORD PTR [r8+32] + vmovdqu ymm8, YMMWORD PTR [r8+1056] + vmovdqu ymm10, YMMWORD PTR [r8+2080] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, 
ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3104] + vmovdqu ymm4, YMMWORD PTR [rdx+4128] + vmovdqu ymm8, YMMWORD PTR [r8+3104] + vmovdqu ymm10, YMMWORD PTR [r8+4128] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5152] + vmovdqu ymm4, YMMWORD PTR [rdx+6176] + vmovdqu ymm8, YMMWORD PTR [r8+5152] + vmovdqu ymm10, YMMWORD PTR [r8+6176] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm0 + ; 16..23 + vmovdqu ymm0, YMMWORD PTR [rdx+64] + vmovdqu ymm2, YMMWORD PTR [rdx+1088] + vmovdqu ymm4, YMMWORD PTR [rdx+2112] + vmovdqu ymm6, YMMWORD PTR [r8+64] + vmovdqu ymm8, YMMWORD PTR [r8+1088] + vmovdqu ymm10, YMMWORD PTR [r8+2112] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 
+ vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3136] + vmovdqu ymm4, YMMWORD PTR [rdx+4160] + vmovdqu ymm8, YMMWORD PTR [r8+3136] + vmovdqu ymm10, YMMWORD PTR [r8+4160] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5184] + vmovdqu ymm4, YMMWORD PTR [rdx+6208] + vmovdqu ymm8, YMMWORD PTR [r8+5184] + vmovdqu ymm10, YMMWORD PTR [r8+6208] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm0 + ; 24..31 + vmovdqu ymm0, YMMWORD PTR [rdx+96] + vmovdqu ymm2, YMMWORD PTR [rdx+1120] + vmovdqu ymm4, YMMWORD PTR [rdx+2144] + vmovdqu ymm6, YMMWORD PTR [r8+96] + vmovdqu ymm8, YMMWORD PTR [r8+1120] + vmovdqu ymm10, YMMWORD PTR [r8+2144] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3168] + vmovdqu ymm4, YMMWORD PTR [rdx+4192] + vmovdqu 
ymm8, YMMWORD PTR [r8+3168] + vmovdqu ymm10, YMMWORD PTR [r8+4192] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5216] + vmovdqu ymm4, YMMWORD PTR [rdx+6240] + vmovdqu ymm8, YMMWORD PTR [r8+5216] + vmovdqu ymm10, YMMWORD PTR [r8+6240] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm0 + ; 32..39 + vmovdqu ymm0, YMMWORD PTR [rdx+128] + vmovdqu ymm2, YMMWORD PTR [rdx+1152] + vmovdqu ymm4, YMMWORD PTR [rdx+2176] + vmovdqu ymm6, YMMWORD PTR [r8+128] + vmovdqu ymm8, YMMWORD PTR [r8+1152] + vmovdqu ymm10, YMMWORD PTR [r8+2176] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3200] + vmovdqu ymm4, YMMWORD PTR [rdx+4224] + vmovdqu ymm8, YMMWORD PTR [r8+3200] + vmovdqu ymm10, YMMWORD PTR [r8+4224] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 
245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5248] + vmovdqu ymm4, YMMWORD PTR [rdx+6272] + vmovdqu ymm8, YMMWORD PTR [r8+5248] + vmovdqu ymm10, YMMWORD PTR [r8+6272] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + ; 40..47 + vmovdqu ymm0, YMMWORD PTR [rdx+160] + vmovdqu ymm2, YMMWORD PTR [rdx+1184] + vmovdqu ymm4, YMMWORD PTR [rdx+2208] + vmovdqu ymm6, YMMWORD PTR [r8+160] + vmovdqu ymm8, YMMWORD PTR [r8+1184] + vmovdqu ymm10, YMMWORD PTR [r8+2208] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3232] + vmovdqu ymm4, YMMWORD PTR [rdx+4256] + vmovdqu ymm8, YMMWORD PTR [r8+3232] + vmovdqu ymm10, YMMWORD PTR [r8+4256] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + 
vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5280] + vmovdqu ymm4, YMMWORD PTR [rdx+6304] + vmovdqu ymm8, YMMWORD PTR [r8+5280] + vmovdqu ymm10, YMMWORD PTR [r8+6304] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+160], ymm0 + ; 48..55 + vmovdqu ymm0, YMMWORD PTR [rdx+192] + vmovdqu ymm2, YMMWORD PTR [rdx+1216] + vmovdqu ymm4, YMMWORD PTR [rdx+2240] + vmovdqu ymm6, YMMWORD PTR [r8+192] + vmovdqu ymm8, YMMWORD PTR [r8+1216] + vmovdqu ymm10, YMMWORD PTR [r8+2240] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3264] + vmovdqu ymm4, YMMWORD PTR [rdx+4288] + vmovdqu ymm8, YMMWORD PTR [r8+3264] + vmovdqu ymm10, YMMWORD PTR [r8+4288] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq 
ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5312] + vmovdqu ymm4, YMMWORD PTR [rdx+6336] + vmovdqu ymm8, YMMWORD PTR [r8+5312] + vmovdqu ymm10, YMMWORD PTR [r8+6336] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+192], ymm0 + ; 56..63 + vmovdqu ymm0, YMMWORD PTR [rdx+224] + vmovdqu ymm2, YMMWORD PTR [rdx+1248] + vmovdqu ymm4, YMMWORD PTR [rdx+2272] + vmovdqu ymm6, YMMWORD PTR [r8+224] + vmovdqu ymm8, YMMWORD PTR [r8+1248] + vmovdqu ymm10, YMMWORD PTR [r8+2272] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3296] + vmovdqu ymm4, YMMWORD PTR [rdx+4320] + vmovdqu ymm8, YMMWORD PTR [r8+3296] + vmovdqu ymm10, YMMWORD PTR [r8+4320] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5344] + vmovdqu ymm4, YMMWORD PTR 
[rdx+6368] + vmovdqu ymm8, YMMWORD PTR [r8+5344] + vmovdqu ymm10, YMMWORD PTR [r8+6368] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+224], ymm0 + ; 64..71 + vmovdqu ymm0, YMMWORD PTR [rdx+256] + vmovdqu ymm2, YMMWORD PTR [rdx+1280] + vmovdqu ymm4, YMMWORD PTR [rdx+2304] + vmovdqu ymm6, YMMWORD PTR [r8+256] + vmovdqu ymm8, YMMWORD PTR [r8+1280] + vmovdqu ymm10, YMMWORD PTR [r8+2304] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3328] + vmovdqu ymm4, YMMWORD PTR [rdx+4352] + vmovdqu ymm8, YMMWORD PTR [r8+3328] + vmovdqu ymm10, YMMWORD PTR [r8+4352] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5376] + vmovdqu ymm4, YMMWORD PTR [rdx+6400] + vmovdqu ymm8, YMMWORD PTR [r8+5376] + vmovdqu ymm10, YMMWORD PTR [r8+6400] + vpshufd ymm3, ymm2, 245 
+ vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+256], ymm0 + ; 72..79 + vmovdqu ymm0, YMMWORD PTR [rdx+288] + vmovdqu ymm2, YMMWORD PTR [rdx+1312] + vmovdqu ymm4, YMMWORD PTR [rdx+2336] + vmovdqu ymm6, YMMWORD PTR [r8+288] + vmovdqu ymm8, YMMWORD PTR [r8+1312] + vmovdqu ymm10, YMMWORD PTR [r8+2336] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3360] + vmovdqu ymm4, YMMWORD PTR [rdx+4384] + vmovdqu ymm8, YMMWORD PTR [r8+3360] + vmovdqu ymm10, YMMWORD PTR [r8+4384] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5408] + vmovdqu ymm4, YMMWORD PTR [rdx+6432] + vmovdqu ymm8, YMMWORD PTR [r8+5408] + vmovdqu ymm10, YMMWORD PTR [r8+6432] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + 
vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+288], ymm0 + ; 80..87 + vmovdqu ymm0, YMMWORD PTR [rdx+320] + vmovdqu ymm2, YMMWORD PTR [rdx+1344] + vmovdqu ymm4, YMMWORD PTR [rdx+2368] + vmovdqu ymm6, YMMWORD PTR [r8+320] + vmovdqu ymm8, YMMWORD PTR [r8+1344] + vmovdqu ymm10, YMMWORD PTR [r8+2368] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3392] + vmovdqu ymm4, YMMWORD PTR [rdx+4416] + vmovdqu ymm8, YMMWORD PTR [r8+3392] + vmovdqu ymm10, YMMWORD PTR [r8+4416] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5440] + vmovdqu ymm4, YMMWORD PTR [rdx+6464] + vmovdqu ymm8, YMMWORD PTR [r8+5440] + vmovdqu ymm10, YMMWORD PTR [r8+6464] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + 
vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+320], ymm0 + ; 88..95 + vmovdqu ymm0, YMMWORD PTR [rdx+352] + vmovdqu ymm2, YMMWORD PTR [rdx+1376] + vmovdqu ymm4, YMMWORD PTR [rdx+2400] + vmovdqu ymm6, YMMWORD PTR [r8+352] + vmovdqu ymm8, YMMWORD PTR [r8+1376] + vmovdqu ymm10, YMMWORD PTR [r8+2400] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3424] + vmovdqu ymm4, YMMWORD PTR [rdx+4448] + vmovdqu ymm8, YMMWORD PTR [r8+3424] + vmovdqu ymm10, YMMWORD PTR [r8+4448] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5472] + vmovdqu ymm4, YMMWORD PTR [rdx+6496] + vmovdqu ymm8, YMMWORD PTR [r8+5472] + vmovdqu ymm10, YMMWORD PTR [r8+6496] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, 
ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+352], ymm0 + ; 96..103 + vmovdqu ymm0, YMMWORD PTR [rdx+384] + vmovdqu ymm2, YMMWORD PTR [rdx+1408] + vmovdqu ymm4, YMMWORD PTR [rdx+2432] + vmovdqu ymm6, YMMWORD PTR [r8+384] + vmovdqu ymm8, YMMWORD PTR [r8+1408] + vmovdqu ymm10, YMMWORD PTR [r8+2432] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3456] + vmovdqu ymm4, YMMWORD PTR [rdx+4480] + vmovdqu ymm8, YMMWORD PTR [r8+3456] + vmovdqu ymm10, YMMWORD PTR [r8+4480] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5504] + vmovdqu ymm4, YMMWORD PTR [rdx+6528] + vmovdqu ymm8, YMMWORD PTR [r8+5504] + vmovdqu ymm10, YMMWORD PTR [r8+6528] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, 
ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+384], ymm0 + ; 104..111 + vmovdqu ymm0, YMMWORD PTR [rdx+416] + vmovdqu ymm2, YMMWORD PTR [rdx+1440] + vmovdqu ymm4, YMMWORD PTR [rdx+2464] + vmovdqu ymm6, YMMWORD PTR [r8+416] + vmovdqu ymm8, YMMWORD PTR [r8+1440] + vmovdqu ymm10, YMMWORD PTR [r8+2464] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3488] + vmovdqu ymm4, YMMWORD PTR [rdx+4512] + vmovdqu ymm8, YMMWORD PTR [r8+3488] + vmovdqu ymm10, YMMWORD PTR [r8+4512] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5536] + vmovdqu ymm4, YMMWORD PTR [rdx+6560] + vmovdqu ymm8, YMMWORD PTR [r8+5536] + vmovdqu ymm10, YMMWORD PTR [r8+6560] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR 
[rcx+416], ymm0 + ; 112..119 + vmovdqu ymm0, YMMWORD PTR [rdx+448] + vmovdqu ymm2, YMMWORD PTR [rdx+1472] + vmovdqu ymm4, YMMWORD PTR [rdx+2496] + vmovdqu ymm6, YMMWORD PTR [r8+448] + vmovdqu ymm8, YMMWORD PTR [r8+1472] + vmovdqu ymm10, YMMWORD PTR [r8+2496] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3520] + vmovdqu ymm4, YMMWORD PTR [rdx+4544] + vmovdqu ymm8, YMMWORD PTR [r8+3520] + vmovdqu ymm10, YMMWORD PTR [r8+4544] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5568] + vmovdqu ymm4, YMMWORD PTR [rdx+6592] + vmovdqu ymm8, YMMWORD PTR [r8+5568] + vmovdqu ymm10, YMMWORD PTR [r8+6592] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+448], ymm0 + ; 120..127 + vmovdqu ymm0, YMMWORD PTR [rdx+480] + vmovdqu ymm2, YMMWORD PTR [rdx+1504] + 
vmovdqu ymm4, YMMWORD PTR [rdx+2528] + vmovdqu ymm6, YMMWORD PTR [r8+480] + vmovdqu ymm8, YMMWORD PTR [r8+1504] + vmovdqu ymm10, YMMWORD PTR [r8+2528] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3552] + vmovdqu ymm4, YMMWORD PTR [rdx+4576] + vmovdqu ymm8, YMMWORD PTR [r8+3552] + vmovdqu ymm10, YMMWORD PTR [r8+4576] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5600] + vmovdqu ymm4, YMMWORD PTR [rdx+6624] + vmovdqu ymm8, YMMWORD PTR [r8+5600] + vmovdqu ymm10, YMMWORD PTR [r8+6624] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+480], ymm0 + ; 128..135 + vmovdqu ymm0, YMMWORD PTR [rdx+512] + vmovdqu ymm2, YMMWORD PTR [rdx+1536] + vmovdqu ymm4, YMMWORD PTR [rdx+2560] + vmovdqu ymm6, YMMWORD PTR [r8+512] + vmovdqu ymm8, YMMWORD PTR [r8+1536] 
+ vmovdqu ymm10, YMMWORD PTR [r8+2560] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3584] + vmovdqu ymm4, YMMWORD PTR [rdx+4608] + vmovdqu ymm8, YMMWORD PTR [r8+3584] + vmovdqu ymm10, YMMWORD PTR [r8+4608] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5632] + vmovdqu ymm4, YMMWORD PTR [rdx+6656] + vmovdqu ymm8, YMMWORD PTR [r8+5632] + vmovdqu ymm10, YMMWORD PTR [r8+6656] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+512], ymm0 + ; 136..143 + vmovdqu ymm0, YMMWORD PTR [rdx+544] + vmovdqu ymm2, YMMWORD PTR [rdx+1568] + vmovdqu ymm4, YMMWORD PTR [rdx+2592] + vmovdqu ymm6, YMMWORD PTR [r8+544] + vmovdqu ymm8, YMMWORD PTR [r8+1568] + vmovdqu ymm10, YMMWORD PTR [r8+2592] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 
245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3616] + vmovdqu ymm4, YMMWORD PTR [rdx+4640] + vmovdqu ymm8, YMMWORD PTR [r8+3616] + vmovdqu ymm10, YMMWORD PTR [r8+4640] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5664] + vmovdqu ymm4, YMMWORD PTR [rdx+6688] + vmovdqu ymm8, YMMWORD PTR [r8+5664] + vmovdqu ymm10, YMMWORD PTR [r8+6688] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+544], ymm0 + ; 144..151 + vmovdqu ymm0, YMMWORD PTR [rdx+576] + vmovdqu ymm2, YMMWORD PTR [rdx+1600] + vmovdqu ymm4, YMMWORD PTR [rdx+2624] + vmovdqu ymm6, YMMWORD PTR [r8+576] + vmovdqu ymm8, YMMWORD PTR [r8+1600] + vmovdqu ymm10, YMMWORD PTR [r8+2624] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + 
vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3648] + vmovdqu ymm4, YMMWORD PTR [rdx+4672] + vmovdqu ymm8, YMMWORD PTR [r8+3648] + vmovdqu ymm10, YMMWORD PTR [r8+4672] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5696] + vmovdqu ymm4, YMMWORD PTR [rdx+6720] + vmovdqu ymm8, YMMWORD PTR [r8+5696] + vmovdqu ymm10, YMMWORD PTR [r8+6720] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+576], ymm0 + ; 152..159 + vmovdqu ymm0, YMMWORD PTR [rdx+608] + vmovdqu ymm2, YMMWORD PTR [rdx+1632] + vmovdqu ymm4, YMMWORD PTR [rdx+2656] + vmovdqu ymm6, YMMWORD PTR [r8+608] + vmovdqu ymm8, YMMWORD PTR [r8+1632] + vmovdqu ymm10, YMMWORD PTR [r8+2656] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + 
vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3680] + vmovdqu ymm4, YMMWORD PTR [rdx+4704] + vmovdqu ymm8, YMMWORD PTR [r8+3680] + vmovdqu ymm10, YMMWORD PTR [r8+4704] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5728] + vmovdqu ymm4, YMMWORD PTR [rdx+6752] + vmovdqu ymm8, YMMWORD PTR [r8+5728] + vmovdqu ymm10, YMMWORD PTR [r8+6752] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+608], ymm0 + ; 160..167 + vmovdqu ymm0, YMMWORD PTR [rdx+640] + vmovdqu ymm2, YMMWORD PTR [rdx+1664] + vmovdqu ymm4, YMMWORD PTR [rdx+2688] + vmovdqu ymm6, YMMWORD PTR [r8+640] + vmovdqu ymm8, YMMWORD PTR [r8+1664] + vmovdqu ymm10, YMMWORD PTR [r8+2688] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq 
ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3712] + vmovdqu ymm4, YMMWORD PTR [rdx+4736] + vmovdqu ymm8, YMMWORD PTR [r8+3712] + vmovdqu ymm10, YMMWORD PTR [r8+4736] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5760] + vmovdqu ymm4, YMMWORD PTR [rdx+6784] + vmovdqu ymm8, YMMWORD PTR [r8+5760] + vmovdqu ymm10, YMMWORD PTR [r8+6784] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+640], ymm0 + ; 168..175 + vmovdqu ymm0, YMMWORD PTR [rdx+672] + vmovdqu ymm2, YMMWORD PTR [rdx+1696] + vmovdqu ymm4, YMMWORD PTR [rdx+2720] + vmovdqu ymm6, YMMWORD PTR [r8+672] + vmovdqu ymm8, YMMWORD PTR [r8+1696] + vmovdqu ymm10, YMMWORD PTR [r8+2720] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3744] + vmovdqu ymm4, YMMWORD PTR [rdx+4768] + vmovdqu ymm8, 
YMMWORD PTR [r8+3744] + vmovdqu ymm10, YMMWORD PTR [r8+4768] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5792] + vmovdqu ymm4, YMMWORD PTR [rdx+6816] + vmovdqu ymm8, YMMWORD PTR [r8+5792] + vmovdqu ymm10, YMMWORD PTR [r8+6816] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+672], ymm0 + ; 176..183 + vmovdqu ymm0, YMMWORD PTR [rdx+704] + vmovdqu ymm2, YMMWORD PTR [rdx+1728] + vmovdqu ymm4, YMMWORD PTR [rdx+2752] + vmovdqu ymm6, YMMWORD PTR [r8+704] + vmovdqu ymm8, YMMWORD PTR [r8+1728] + vmovdqu ymm10, YMMWORD PTR [r8+2752] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3776] + vmovdqu ymm4, YMMWORD PTR [rdx+4800] + vmovdqu ymm8, YMMWORD PTR [r8+3776] + vmovdqu ymm10, YMMWORD PTR [r8+4800] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 
+ vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5824] + vmovdqu ymm4, YMMWORD PTR [rdx+6848] + vmovdqu ymm8, YMMWORD PTR [r8+5824] + vmovdqu ymm10, YMMWORD PTR [r8+6848] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+704], ymm0 + ; 184..191 + vmovdqu ymm0, YMMWORD PTR [rdx+736] + vmovdqu ymm2, YMMWORD PTR [rdx+1760] + vmovdqu ymm4, YMMWORD PTR [rdx+2784] + vmovdqu ymm6, YMMWORD PTR [r8+736] + vmovdqu ymm8, YMMWORD PTR [r8+1760] + vmovdqu ymm10, YMMWORD PTR [r8+2784] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3808] + vmovdqu ymm4, YMMWORD PTR [rdx+4832] + vmovdqu ymm8, YMMWORD PTR [r8+3808] + vmovdqu ymm10, YMMWORD PTR [r8+4832] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + 
vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5856] + vmovdqu ymm4, YMMWORD PTR [rdx+6880] + vmovdqu ymm8, YMMWORD PTR [r8+5856] + vmovdqu ymm10, YMMWORD PTR [r8+6880] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+736], ymm0 + ; 192..199 + vmovdqu ymm0, YMMWORD PTR [rdx+768] + vmovdqu ymm2, YMMWORD PTR [rdx+1792] + vmovdqu ymm4, YMMWORD PTR [rdx+2816] + vmovdqu ymm6, YMMWORD PTR [r8+768] + vmovdqu ymm8, YMMWORD PTR [r8+1792] + vmovdqu ymm10, YMMWORD PTR [r8+2816] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3840] + vmovdqu ymm4, YMMWORD PTR [rdx+4864] + vmovdqu ymm8, YMMWORD PTR [r8+3840] + vmovdqu ymm10, YMMWORD PTR [r8+4864] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + 
vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5888] + vmovdqu ymm4, YMMWORD PTR [rdx+6912] + vmovdqu ymm8, YMMWORD PTR [r8+5888] + vmovdqu ymm10, YMMWORD PTR [r8+6912] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+768], ymm0 + ; 200..207 + vmovdqu ymm0, YMMWORD PTR [rdx+800] + vmovdqu ymm2, YMMWORD PTR [rdx+1824] + vmovdqu ymm4, YMMWORD PTR [rdx+2848] + vmovdqu ymm6, YMMWORD PTR [r8+800] + vmovdqu ymm8, YMMWORD PTR [r8+1824] + vmovdqu ymm10, YMMWORD PTR [r8+2848] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3872] + vmovdqu ymm4, YMMWORD PTR [rdx+4896] + vmovdqu ymm8, YMMWORD PTR [r8+3872] + vmovdqu ymm10, YMMWORD PTR [r8+4896] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5920] + vmovdqu ymm4, YMMWORD 
PTR [rdx+6944] + vmovdqu ymm8, YMMWORD PTR [r8+5920] + vmovdqu ymm10, YMMWORD PTR [r8+6944] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+800], ymm0 + ; 208..215 + vmovdqu ymm0, YMMWORD PTR [rdx+832] + vmovdqu ymm2, YMMWORD PTR [rdx+1856] + vmovdqu ymm4, YMMWORD PTR [rdx+2880] + vmovdqu ymm6, YMMWORD PTR [r8+832] + vmovdqu ymm8, YMMWORD PTR [r8+1856] + vmovdqu ymm10, YMMWORD PTR [r8+2880] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3904] + vmovdqu ymm4, YMMWORD PTR [rdx+4928] + vmovdqu ymm8, YMMWORD PTR [r8+3904] + vmovdqu ymm10, YMMWORD PTR [r8+4928] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5952] + vmovdqu ymm4, YMMWORD PTR [rdx+6976] + vmovdqu ymm8, YMMWORD PTR [r8+5952] + vmovdqu ymm10, YMMWORD PTR [r8+6976] + vpshufd ymm3, 
ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+832], ymm0 + ; 216..223 + vmovdqu ymm0, YMMWORD PTR [rdx+864] + vmovdqu ymm2, YMMWORD PTR [rdx+1888] + vmovdqu ymm4, YMMWORD PTR [rdx+2912] + vmovdqu ymm6, YMMWORD PTR [r8+864] + vmovdqu ymm8, YMMWORD PTR [r8+1888] + vmovdqu ymm10, YMMWORD PTR [r8+2912] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3936] + vmovdqu ymm4, YMMWORD PTR [rdx+4960] + vmovdqu ymm8, YMMWORD PTR [r8+3936] + vmovdqu ymm10, YMMWORD PTR [r8+4960] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+5984] + vmovdqu ymm4, YMMWORD PTR [rdx+7008] + vmovdqu ymm8, YMMWORD PTR [r8+5984] + vmovdqu ymm10, YMMWORD PTR [r8+7008] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, 
ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+864], ymm0 + ; 224..231 + vmovdqu ymm0, YMMWORD PTR [rdx+896] + vmovdqu ymm2, YMMWORD PTR [rdx+1920] + vmovdqu ymm4, YMMWORD PTR [rdx+2944] + vmovdqu ymm6, YMMWORD PTR [r8+896] + vmovdqu ymm8, YMMWORD PTR [r8+1920] + vmovdqu ymm10, YMMWORD PTR [r8+2944] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+3968] + vmovdqu ymm4, YMMWORD PTR [rdx+4992] + vmovdqu ymm8, YMMWORD PTR [r8+3968] + vmovdqu ymm10, YMMWORD PTR [r8+4992] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+6016] + vmovdqu ymm4, YMMWORD PTR [rdx+7040] + vmovdqu ymm8, YMMWORD PTR [r8+6016] + vmovdqu ymm10, YMMWORD PTR [r8+7040] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, 
ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+896], ymm0 + ; 232..239 + vmovdqu ymm0, YMMWORD PTR [rdx+928] + vmovdqu ymm2, YMMWORD PTR [rdx+1952] + vmovdqu ymm4, YMMWORD PTR [rdx+2976] + vmovdqu ymm6, YMMWORD PTR [r8+928] + vmovdqu ymm8, YMMWORD PTR [r8+1952] + vmovdqu ymm10, YMMWORD PTR [r8+2976] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+4000] + vmovdqu ymm4, YMMWORD PTR [rdx+5024] + vmovdqu ymm8, YMMWORD PTR [r8+4000] + vmovdqu ymm10, YMMWORD PTR [r8+5024] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+6048] + vmovdqu ymm4, YMMWORD PTR [rdx+7072] + vmovdqu ymm8, YMMWORD PTR [r8+6048] + vmovdqu ymm10, YMMWORD PTR [r8+7072] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld 
ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+928], ymm0 + ; 240..247 + vmovdqu ymm0, YMMWORD PTR [rdx+960] + vmovdqu ymm2, YMMWORD PTR [rdx+1984] + vmovdqu ymm4, YMMWORD PTR [rdx+3008] + vmovdqu ymm6, YMMWORD PTR [r8+960] + vmovdqu ymm8, YMMWORD PTR [r8+1984] + vmovdqu ymm10, YMMWORD PTR [r8+3008] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+4032] + vmovdqu ymm4, YMMWORD PTR [rdx+5056] + vmovdqu ymm8, YMMWORD PTR [r8+4032] + vmovdqu ymm10, YMMWORD PTR [r8+5056] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+6080] + vmovdqu ymm4, YMMWORD PTR [rdx+7104] + vmovdqu ymm8, YMMWORD PTR [r8+6080] + vmovdqu ymm10, YMMWORD PTR [r8+7104] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd 
ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR [rcx+960], ymm0 + ; 248..255 + vmovdqu ymm0, YMMWORD PTR [rdx+992] + vmovdqu ymm2, YMMWORD PTR [rdx+2016] + vmovdqu ymm4, YMMWORD PTR [rdx+3040] + vmovdqu ymm6, YMMWORD PTR [r8+992] + vmovdqu ymm8, YMMWORD PTR [r8+2016] + vmovdqu ymm10, YMMWORD PTR [r8+3040] + vpshufd ymm1, ymm0, 245 + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm7, ymm6, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm0, ymm0, ymm6 + vpmuldq ymm1, ymm1, ymm7 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+4064] + vmovdqu ymm4, YMMWORD PTR [rdx+5088] + vmovdqu ymm8, YMMWORD PTR [r8+4064] + vmovdqu ymm10, YMMWORD PTR [r8+5088] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + vmovdqu ymm2, YMMWORD PTR [rdx+6112] + vmovdqu ymm4, YMMWORD PTR [rdx+7136] + vmovdqu ymm8, YMMWORD PTR [r8+6112] + vmovdqu ymm10, YMMWORD PTR [r8+7136] + vpshufd ymm3, ymm2, 245 + vpshufd ymm5, ymm4, 245 + vpshufd ymm9, ymm8, 245 + vpshufd ymm11, ymm10, 245 + vpmuldq ymm2, ymm2, ymm8 + vpmuldq ymm3, ymm3, ymm9 + vpmuldq ymm4, ymm4, ymm10 + vpmuldq ymm5, ymm5, ymm11 + vpaddq ymm0, ymm0, ymm2 + vpaddq ymm1, ymm1, ymm3 + vpaddq ymm0, ymm0, ymm4 + vpaddq ymm1, ymm1, ymm5 + ; Mont Reduce 2 + vpmulld ymm6, ymm0, ymm13 + vpmulld ymm7, ymm1, ymm13 + vpmuldq ymm6, ymm6, ymm12 + vpmuldq ymm7, ymm7, ymm12 + vpsubd ymm0, ymm0, ymm6 + vpsubd ymm1, ymm1, ymm7 + vpsrlq ymm0, ymm0, 32 + vpor ymm0, ymm0, ymm1 + vmovdqu YMMWORD PTR 
[rcx+992], ymm0 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + add rsp, 128 + ret +wc_mldsa_mul_vec_7_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_rej_idx QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000000000001h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000000000002h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000000000002h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000000000003h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000001h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000000000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000002h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000000000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000000000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000000000004h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000000h, 0000000000000000h + QWORD 
0000000000000000h, 0000000000000000h + QWORD 0000000400000001h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000000000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000002h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000000000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000000000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000002h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000003h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000000000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000001h, 0000000000000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000002h, 0000000000000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000400000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000400000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000000000004h, 0000000000000000h + QWORD 0000000000000005h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000500000000h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000500000001h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000000000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000500000002h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000000000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000000000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000500000002h + QWORD 0000000000000000h, 
0000000000000000h + QWORD 0000000500000003h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000000000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000001h, 0000000000000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000500000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000002h, 0000000000000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000500000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000500000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000000000005h, 0000000000000000h + QWORD 0000000500000004h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000000h, 0000000000000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000001h, 0000000000000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000500000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000002h, 0000000000000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000500000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000500000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000002h + QWORD 0000000000000005h, 0000000000000000h + QWORD 0000000400000003h, 0000000000000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000500000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000001h, 0000000500000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000003h + QWORD 0000000000000005h, 0000000000000000h + QWORD 0000000300000002h, 0000000500000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000400000003h + QWORD 0000000000000005h, 0000000000000000h + 
QWORD 0000000200000001h, 0000000400000003h + QWORD 0000000000000005h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000500000004h, 0000000000000000h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000600000000h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000600000001h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000600000002h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000600000002h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000600000003h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000001h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000600000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000002h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000600000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000600000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000600000004h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000000h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000001h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000600000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 
0000000400000002h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000600000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000600000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000002h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000400000003h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000600000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000001h, 0000000600000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000003h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000300000002h, 0000000600000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000400000003h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000200000001h, 0000000400000003h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000600000004h, 0000000000000000h + QWORD 0000000600000005h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000500000000h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000500000001h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000600000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000500000002h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000600000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000600000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000500000002h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000500000003h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000600000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000001h, 
0000000600000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000500000003h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000300000002h, 0000000600000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000500000003h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000200000001h, 0000000500000003h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000600000005h, 0000000000000000h + QWORD 0000000500000004h, 0000000000000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000000h, 0000000600000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000001h, 0000000600000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000500000004h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000400000002h, 0000000600000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000500000004h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000200000001h, 0000000500000004h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000002h + QWORD 0000000600000005h, 0000000000000000h + QWORD 0000000400000003h, 0000000600000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000500000004h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000300000001h, 0000000500000004h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000003h + QWORD 0000000600000005h, 0000000000000000h + QWORD 0000000300000002h, 0000000500000004h + QWORD 0000000000000006h, 0000000000000000h + QWORD 0000000200000000h, 0000000400000003h + QWORD 0000000600000005h, 0000000000000000h + QWORD 0000000200000001h, 0000000400000003h + QWORD 0000000600000005h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000500000004h, 0000000000000006h + QWORD 0000000000000007h, 0000000000000000h + 
QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000700000000h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000700000001h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000700000002h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000700000002h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000700000003h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000001h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000700000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000002h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000700000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000700000003h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000700000004h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000000h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000001h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000700000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000002h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000700000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000700000004h + QWORD 
0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000002h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000400000003h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000700000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000001h, 0000000700000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000003h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000300000002h, 0000000700000004h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000400000003h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000200000001h, 0000000400000003h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000700000004h, 0000000000000000h + QWORD 0000000700000005h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000500000000h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000500000001h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000700000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000500000002h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000700000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000700000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000500000002h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000500000003h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000700000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000001h, 0000000700000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000500000003h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000300000002h, 0000000700000005h + QWORD 0000000000000000h, 
0000000000000000h + QWORD 0000000200000000h, 0000000500000003h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000200000001h, 0000000500000003h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000700000005h, 0000000000000000h + QWORD 0000000500000004h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000000h, 0000000700000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000001h, 0000000700000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000500000004h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000400000002h, 0000000700000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000500000004h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000200000001h, 0000000500000004h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000002h + QWORD 0000000700000005h, 0000000000000000h + QWORD 0000000400000003h, 0000000700000005h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000500000004h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000300000001h, 0000000500000004h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000003h + QWORD 0000000700000005h, 0000000000000000h + QWORD 0000000300000002h, 0000000500000004h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000200000000h, 0000000400000003h + QWORD 0000000700000005h, 0000000000000000h + QWORD 0000000200000001h, 0000000400000003h + QWORD 0000000700000005h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000500000004h, 0000000000000007h + QWORD 0000000700000006h, 0000000000000000h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000600000000h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000600000001h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + 
QWORD 0000000100000000h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000600000002h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000001h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000600000002h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000600000003h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000001h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000600000003h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000300000002h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000600000003h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000200000001h, 0000000600000003h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000700000006h, 0000000000000000h + QWORD 0000000600000004h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000000h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000400000001h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000600000004h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000400000002h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000600000004h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000200000001h, 0000000600000004h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000002h + QWORD 0000000700000006h, 0000000000000000h + QWORD 0000000400000003h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 
0000000300000000h, 0000000600000004h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000300000001h, 0000000600000004h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000100000000h, 0000000400000003h + QWORD 0000000700000006h, 0000000000000000h + QWORD 0000000300000002h, 0000000600000004h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000200000000h, 0000000400000003h + QWORD 0000000700000006h, 0000000000000000h + QWORD 0000000200000001h, 0000000400000003h + QWORD 0000000700000006h, 0000000000000000h + QWORD 0000000100000000h, 0000000300000002h + QWORD 0000000600000004h, 0000000000000007h + QWORD 0000000600000005h, 0000000000000007h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000500000000h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000500000001h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000100000000h, 0000000600000005h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000500000002h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000200000000h, 0000000600000005h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000200000001h, 0000000600000005h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000100000000h, 0000000500000002h + QWORD 0000000700000006h, 0000000000000000h + QWORD 0000000500000003h, 0000000700000006h + QWORD 0000000000000000h, 0000000000000000h + QWORD 0000000300000000h, 0000000600000005h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000300000001h, 0000000600000005h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000100000000h, 0000000500000003h + QWORD 0000000700000006h, 0000000000000000h + QWORD 0000000300000002h, 0000000600000005h + QWORD 0000000000000007h, 0000000000000000h + QWORD 0000000200000000h, 0000000500000003h + QWORD 0000000700000006h, 0000000000000000h + QWORD 0000000200000001h, 0000000500000003h + QWORD 0000000700000006h, 0000000000000000h + QWORD 0000000100000000h, 
0000000300000002h
+        QWORD 0000000600000005h, 0000000000000007h
+        QWORD 0000000500000004h, 0000000700000006h
+        QWORD 0000000000000000h, 0000000000000000h
+        QWORD 0000000400000000h, 0000000600000005h
+        QWORD 0000000000000007h, 0000000000000000h
+        QWORD 0000000400000001h, 0000000600000005h
+        QWORD 0000000000000007h, 0000000000000000h
+        QWORD 0000000100000000h, 0000000500000004h
+        QWORD 0000000700000006h, 0000000000000000h
+        QWORD 0000000400000002h, 0000000600000005h
+        QWORD 0000000000000007h, 0000000000000000h
+        QWORD 0000000200000000h, 0000000500000004h
+        QWORD 0000000700000006h, 0000000000000000h
+        QWORD 0000000200000001h, 0000000500000004h
+        QWORD 0000000700000006h, 0000000000000000h
+        QWORD 0000000100000000h, 0000000400000002h
+        QWORD 0000000600000005h, 0000000000000007h
+        QWORD 0000000400000003h, 0000000600000005h
+        QWORD 0000000000000007h, 0000000000000000h
+        QWORD 0000000300000000h, 0000000500000004h
+        QWORD 0000000700000006h, 0000000000000000h
+        QWORD 0000000300000001h, 0000000500000004h
+        QWORD 0000000700000006h, 0000000000000000h
+        QWORD 0000000100000000h, 0000000400000003h
+        QWORD 0000000600000005h, 0000000000000007h
+        QWORD 0000000300000002h, 0000000500000004h
+        QWORD 0000000700000006h, 0000000000000000h
+        QWORD 0000000200000000h, 0000000400000003h
+        QWORD 0000000600000005h, 0000000000000007h
+        QWORD 0000000200000001h, 0000000400000003h
+        QWORD 0000000600000005h, 0000000000000007h
+        QWORD 0000000100000000h, 0000000300000002h
+        QWORD 0000000500000004h, 0000000700000006h
+; Address of the L_mldsa_rej_idx table, stored in memory so code can fetch
+; the table base with a single 64-bit load.
+ptr_L_mldsa_rej_idx QWORD L_mldsa_rej_idx
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+; Eight 32-bit copies of 007fe001h (8380417 decimal), i.e. one 256-bit
+; vector with the same value in every dword lane.  The rejection-sampling
+; code below compares candidates against this value.
+L_mldsa_rej_q DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h
+        DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h
+ptr_L_mldsa_rej_q QWORD L_mldsa_rej_q
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+; Eight 32-bit copies of 007fffffh: a per-lane mask that keeps the low
+; 23 bits of each dword.
+L_mldsa_rej_mask QWORD 007fffff007fffffh, 007fffff007fffffh
+        QWORD 007fffff007fffffh, 007fffff007fffffh
+ptr_L_mldsa_rej_mask QWORD L_mldsa_rej_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+; VPSHUFB control vector; its four QWORD values follow on the next rows.
+L_mldsa_rej_shuffle QWORD
0005040300020100h, 000b0a0900080706h
+        QWORD 0009080700060504h, 000f0e0d000c0b0ah
+ptr_L_mldsa_rej_shuffle QWORD L_mldsa_rej_shuffle
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+; Thirty-two bytes of 01h.  Loaded into ymm9 by wc_mldsa_rej_uniform_n_avx2,
+; which does not reference ymm9 afterwards in its visible body.
+L_mldsa_rej_ones QWORD 0101010101010101h, 0101010101010101h
+        QWORD 0101010101010101h, 0101010101010101h
+ptr_L_mldsa_rej_ones QWORD L_mldsa_rej_ones
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+; wc_mldsa_rej_uniform_n_avx2
+;
+; Rejection-samples 23-bit candidates from a byte stream: every 3 input
+; bytes form one candidate (top bit cleared); candidates smaller than
+; 8380417 (007fe001h) are written out as consecutive dwords, the others
+; are dropped.
+;
+; Arguments (Microsoft x64 convention, remapped in the prologue):
+;   rcx -> rdx : output pointer, receives accepted 32-bit values
+;   rdx -> r8  : number of values still wanted (also copied to eax)
+;   r8  -> r9  : input byte pointer
+;   r9  -> r10 : input length in bytes (only r10d is used)
+; Returns eax = initial count - remaining count = number of values written.
+;
+; Structure:
+;   1. Fifteen fully unrolled 48-byte groups (720 bytes), then
+;   2. a loop over further 48-byte groups that always runs once and
+;      repeats while at least 48 bytes remain and at least 16 values are
+;      still wanted, then
+;   3. a scalar tail that consumes 6 bytes (two candidates) per pass.
+;
+; NOTE(review): stages 1 and the first pass of stage 2 check neither r8d
+; nor r10d, so the caller presumably always supplies at least 768 input
+; bytes and asks for at least 256 values -- confirm against the C caller.
+; NOTE(review): each vector store writes a full 32 bytes at the current
+; output position regardless of how many lanes were accepted, and each
+; group reads 32 bytes starting 24 bytes into its 48-byte window; the
+; scalar tail reads 8 bytes per 6 consumed.  Both buffers therefore need
+; slack past their logical end -- confirm the caller provides it.
+wc_mldsa_rej_uniform_n_avx2 PROC
+        ; Save the non-volatile GPRs.  r13 and r14 are saved and restored
+        ; but not otherwise used in this routine.
+        push rbx
+        push r12
+        push r13
+        push r14
+        push r15
+        push rdi
+        ; Shift the four register arguments down one slot (see header).
+        mov r10, r9
+        mov r9, r8
+        mov r8, rdx
+        mov rdx, rcx
+        ; Spill xmm6-xmm9 (non-volatile under the Microsoft x64 ABI).
+        sub rsp, 64
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        ; Remember the requested count for the return value.
+        mov eax, r8d
+        ; ymm6 = modulus per lane, ymm7 = 23-bit mask per lane,
+        ; ymm8 = byte shuffle control, ymm9 = bytes of 01h (unused below),
+        ; r11  = base of the L_mldsa_rej_idx permutation table.
+        vmovdqu ymm6, YMMWORD PTR L_mldsa_rej_q
+        vmovdqu ymm7, YMMWORD PTR L_mldsa_rej_mask
+        vmovdqu ymm8, YMMWORD PTR L_mldsa_rej_shuffle
+        vmovdqu ymm9, YMMWORD PTR L_mldsa_rej_ones
+        mov r11, QWORD PTR [ptr_L_mldsa_rej_idx]
+        ; ---- Unrolled group 1 of 15: source offsets 0 and 24 ----
+        ; imm 148 selects source qwords 0,1,1,2 so that each 128-bit lane
+        ; holds the 12 bytes it needs for four 3-byte candidates.
+        vpermq ymm0, [r9], 148
+        vpermq ymm1, [r9+24], 148
+        ; Spread each 3-byte group into its own dword lane.
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        ; Keep the low 23 bits of every lane.
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        ; Lane = all ones where modulus > candidate (candidate accepted).
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        ; Narrow the sixteen dword masks to bytes; the two vpermq 216
+        ; steps undo the per-lane interleave of the pack instructions.
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        ; bl = accept bits for ymm0's lanes, bh = accept bits for ymm1's.
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        ; Table entries are 32 bytes each, so index = mask * 32.
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        ; Permute the accepted lanes to the front of each vector.
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        ; Shifting left does not change the bit count, so these give the
+        ; number of accepted lanes in each vector.
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        ; Store all 32 bytes, then advance only past the accepted dwords;
+        ; the next store overwrites the leftover lanes.
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 2 of 15: source offsets 48 and 72 ----
+        vpermq ymm0, [r9+48], 148
+        vpermq ymm1, [r9+72], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 3 of 15: source offsets 96 and 120 ----
+        vpermq ymm0, [r9+96], 148
+        vpermq ymm1, [r9+120], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 4 of 15: source offsets 144 and 168 ----
+        vpermq ymm0, [r9+144], 148
+        vpermq ymm1, [r9+168], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 5 of 15: source offsets 192 and 216 ----
+        vpermq ymm0, [r9+192], 148
+        vpermq ymm1, [r9+216], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 6 of 15: source offsets 240 and 264 ----
+        vpermq ymm0, [r9+240], 148
+        vpermq ymm1, [r9+264], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 7 of 15: source offsets 288 and 312 ----
+        vpermq ymm0, [r9+288], 148
+        vpermq ymm1, [r9+312], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 8 of 15: source offsets 336 and 360 ----
+        vpermq ymm0, [r9+336], 148
+        vpermq ymm1, [r9+360], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 9 of 15: source offsets 384 and 408 ----
+        vpermq ymm0, [r9+384], 148
+        vpermq ymm1, [r9+408], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 10 of 15: source offsets 432 and 456 ----
+        vpermq ymm0, [r9+432], 148
+        vpermq ymm1, [r9+456], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 11 of 15: source offsets 480 and 504 ----
+        vpermq ymm0, [r9+480], 148
+        vpermq ymm1, [r9+504], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 12 of 15: source offsets 528 and 552 ----
+        vpermq ymm0, [r9+528], 148
+        vpermq ymm1, [r9+552], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 13 of 15: source offsets 576 and 600 ----
+        vpermq ymm0, [r9+576], 148
+        vpermq ymm1, [r9+600], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 14 of 15: source offsets 624 and 648 ----
+        vpermq ymm0, [r9+624], 148
+        vpermq ymm1, [r9+648], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; ---- Unrolled group 15 of 15: source offsets 672 and 696 ----
+        vpermq ymm0, [r9+672], 148
+        vpermq ymm1, [r9+696], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        ; Account for the 15 * 48 = 720 bytes consumed above.
+        add r9, 720
+        sub r10d, 720
+        ; ---- Vector loop: one 48-byte group per pass.  The first pass is
+        ; entered without testing r8d or r10d. ----
+L_mldsa_rej_uniform_n_avx2_start_256:
+        vpermq ymm0, [r9], 148
+        vpermq ymm1, [r9+24], 148
+        vpshufb ymm0, ymm0, ymm8
+        vpshufb ymm1, ymm1, ymm8
+        vpand ymm0, ymm0, ymm7
+        vpand ymm1, ymm1, ymm7
+        vpcmpgtd ymm2, ymm6, ymm0
+        vpcmpgtd ymm3, ymm6, ymm1
+        vpackssdw ymm2, ymm2, ymm3
+        vpermq ymm2, ymm2, 216
+        vpacksswb ymm2, ymm2, ymm2
+        vpermq ymm2, ymm2, 216
+        vpmovmskb rbx, ymm2
+        movzx r12d, bl
+        movzx ecx, bh
+        shl r12d, 5
+        shl ecx, 5
+        vmovdqu ymm2, YMMWORD PTR [r11+r12]
+        vmovdqu ymm3, YMMWORD PTR [r11+rcx]
+        vpermd ymm0, ymm2, ymm0
+        vpermd ymm1, ymm3, ymm1
+        popcnt r12d, r12d
+        popcnt ecx, ecx
+        vmovdqu YMMWORD PTR [rdx], ymm0
+        lea rdx, QWORD PTR [rdx+4*r12]
+        sub r8d, r12d
+        vmovdqu YMMWORD PTR [rdx], ymm1
+        lea rdx, QWORD PTR [rdx+4*rcx]
+        sub r8d, ecx
+        add r9, 48
+        sub r10d, 48
+        ; Leave the vector loop when fewer than 48 input bytes remain ...
+        cmp r10d, 48
+        jl L_mldsa_rej_uniform_n_avx2_done_256
+        ; ... or when fewer than 16 values are still wanted (a full group
+        ; could otherwise produce more values than requested).
+        cmp r8d, 16
+        jge L_mldsa_rej_uniform_n_avx2_start_256
+L_mldsa_rej_uniform_n_avx2_done_256:
+        ; Nothing left to produce: skip the scalar tail.
+        cmp r8d, 0
+        je L_mldsa_rej_uniform_n_avx2_done_64
+        ; rdi = 00ffffff00ffffffh: PDEP mask that places input bits 0..23
+        ;       in bits 0..23 and input bits 24..47 in bits 32..55.
+        ; r15 = 007fffff007fffffh: keeps 23 bits of each half.
+        mov rdi, 72057589759737855
+        mov r15, 36028792732385279
+        ; ---- Scalar tail: 6 input bytes -> two candidates per pass. ----
+L_mldsa_rej_uniform_n_avx2_start_64:
+        ; Loads 8 bytes although only the low 6 are consumed.
+        mov rcx, QWORD PTR [r9]
+        pdep rcx, rcx, rdi
+        and rcx, r15
+        ; First candidate (low dword): reject when >= 8380417.
+        cmp ecx, 8380417
+        jge L_mldsa_rej_uniform_0_avx2_rej_large_0
+        mov DWORD PTR [rdx], ecx
+        add rdx, 4
+        sub r8d, 1
+        je L_mldsa_rej_uniform_n_avx2_done_64
+L_mldsa_rej_uniform_0_avx2_rej_large_0:
+        ; Second candidate (high dword).
+        shr rcx, 32
+        cmp ecx, 8380417
+        jge L_mldsa_rej_uniform_0_avx2_rej_large_1
+        mov DWORD PTR [rdx], ecx
+        add rdx, 4
+        sub r8d, 1
+        je L_mldsa_rej_uniform_n_avx2_done_64
+L_mldsa_rej_uniform_0_avx2_rej_large_1:
+        add r9, 6
+        sub r10d, 6
+        ; Stop when the input is used up, otherwise loop while values are
+        ; still wanted.
+        jle L_mldsa_rej_uniform_n_avx2_done_64
+        cmp r8d, 0
+        jg L_mldsa_rej_uniform_n_avx2_start_64
+L_mldsa_rej_uniform_n_avx2_done_64:
+        vzeroupper
+        ; Return value: requested count minus what is still outstanding.
+        sub eax, r8d
+        ; Restore xmm6-xmm9 and the saved GPRs in reverse order.
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        add rsp, 64
+        pop rdi
+        pop r15
+        pop r14
+        pop r13
+        pop r12
+        pop rbx
+        ret
+wc_mldsa_rej_uniform_n_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; wc_mldsa_rej_uniform_avx2
+;
+; Scalar rejection sampler: repeatedly takes 6 input bytes, splits them
+; into two 23-bit candidates (3 bytes each, top bit cleared) and writes
+; every candidate smaller than 8380417 (007fe001h) to the output as a
+; dword.
+;
+; Arguments (Microsoft x64 convention, remapped in the prologue):
+;   rcx -> rdx : output pointer, receives accepted 32-bit values
+;   rdx -> r8  : number of values wanted (also copied to eax)
+;   r8  -> r9  : input byte pointer
+;   r9  -> r10 : input length in bytes (only r10d is used)
+; Returns eax = number of values written.
+;
+; The loop ends when the requested number of values has been written or
+; the input length has been consumed, whichever comes first.
+;
+; NOTE(review): each pass loads 8 bytes but consumes 6, so the last pass
+; reads 2 bytes beyond the consumed input -- confirm the caller's buffer
+; has that slack.
+; NOTE(review): the first pass runs unconditionally, so the caller is
+; presumably expected to pass a positive count and at least 6 bytes.
+wc_mldsa_rej_uniform_avx2 PROC
+        ; Save non-volatile GPRs.  Only rdi and r15 are modified below;
+        ; the remaining pushes are kept so the frame layout is unchanged.
+        push rbx
+        push r12
+        push r13
+        push r14
+        push r15
+        push rdi
+        ; Shift the four register arguments down one slot (see header).
+        mov r10, r9
+        mov r9, r8
+        mov r8, rdx
+        mov rdx, rcx
+        ; Remember the requested count for the return value.
+        mov eax, r8d
+        ; rdi = 00ffffff00ffffffh: PDEP mask that places input bits 0..23
+        ;       in bits 0..23 and input bits 24..47 in bits 32..55.
+        ; r15 = 007fffff007fffffh: keeps 23 bits of each half.
+        mov rdi, 72057589759737855
+        mov r15, 36028792732385279
+L_mldsa_rej_uniform_avx2_start_64:
+        mov rcx, QWORD PTR [r9]
+        pdep rcx, rcx, rdi
+        and rcx, r15
+        ; First candidate (low dword): reject when >= 8380417.
+        cmp ecx, 8380417
+        jge L_mldsa_rej_uniform_avx2_rej_large_0
+        mov DWORD PTR [rdx], ecx
+        add rdx, 4
+        sub r8d, 1
+        je L_mldsa_rej_uniform_avx2_done_64
+L_mldsa_rej_uniform_avx2_rej_large_0:
+        ; Second candidate (high dword).
+        shr rcx, 32
+        cmp ecx, 8380417
+        jge L_mldsa_rej_uniform_avx2_rej_large_1
+        mov DWORD PTR [rdx], ecx
+        add rdx, 4
+        sub r8d, 1
+        je L_mldsa_rej_uniform_avx2_done_64
+L_mldsa_rej_uniform_avx2_rej_large_1:
+        add r9, 6
+        sub r10d, 6
+        ; Input exhausted: stop.
+        jle L_mldsa_rej_uniform_avx2_done_64
+        ; Fix: branch back to the loop head while values are still wanted.
+        ; Previously control fell straight through to done_64, so only the
+        ; first 6 input bytes were ever examined.  This matches the tail
+        ; loop of wc_mldsa_rej_uniform_n_avx2.
+        cmp r8d, 0
+        jg L_mldsa_rej_uniform_avx2_start_64
+L_mldsa_rej_uniform_avx2_done_64:
+        ; Return value: requested count minus what is still outstanding.
+        sub eax, r8d
+        pop rdi
+        pop r15
+        pop r14
+        pop r13
+        pop r12
+        pop rbx
+        ret
+wc_mldsa_rej_uniform_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_shufb_rej_idx QWORD 0ffffffffffffffffh, 0ffffffffffffffffh
+        QWORD 0ffffffffffff0100h, 0ffffffffffffffffh
+        QWORD 0ffffffffffff0302h, 0ffffffffffffffffh
+        QWORD 0ffffffff03020100h, 0ffffffffffffffffh
+        QWORD 0ffffffffffff0504h, 0ffffffffffffffffh
+        QWORD 0ffffffff05040100h, 0ffffffffffffffffh
+        QWORD 0ffffffff05040302h, 0ffffffffffffffffh
+        QWORD 0ffff050403020100h, 0ffffffffffffffffh
+        QWORD 0ffffffffffff0706h, 0ffffffffffffffffh
+        QWORD 0ffffffff07060100h, 0ffffffffffffffffh
+        QWORD 0ffffffff07060302h, 0ffffffffffffffffh
+        QWORD 0ffff070603020100h, 0ffffffffffffffffh
+        QWORD 0ffffffff07060504h, 0ffffffffffffffffh
+        QWORD 0ffff070605040100h, 0ffffffffffffffffh
+        QWORD 0ffff070605040302h, 0ffffffffffffffffh
+        QWORD 0706050403020100h, 0ffffffffffffffffh
+        QWORD 0ffffffffffff0908h, 0ffffffffffffffffh
+        QWORD 0ffffffff09080100h, 0ffffffffffffffffh
+        QWORD 0ffffffff09080302h, 0ffffffffffffffffh
+        QWORD 0ffff090803020100h,
0ffffffffffffffffh + QWORD 0ffffffff09080504h, 0ffffffffffffffffh + QWORD 0ffff090805040100h, 0ffffffffffffffffh + QWORD 0ffff090805040302h, 0ffffffffffffffffh + QWORD 0908050403020100h, 0ffffffffffffffffh + QWORD 0ffffffff09080706h, 0ffffffffffffffffh + QWORD 0ffff090807060100h, 0ffffffffffffffffh + QWORD 0ffff090807060302h, 0ffffffffffffffffh + QWORD 0908070603020100h, 0ffffffffffffffffh + QWORD 0ffff090807060504h, 0ffffffffffffffffh + QWORD 0908070605040100h, 0ffffffffffffffffh + QWORD 0908070605040302h, 0ffffffffffffffffh + QWORD 0706050403020100h, 0ffffffffffff0908h + QWORD 0ffffffffffff0b0ah, 0ffffffffffffffffh + QWORD 0ffffffff0b0a0100h, 0ffffffffffffffffh + QWORD 0ffffffff0b0a0302h, 0ffffffffffffffffh + QWORD 0ffff0b0a03020100h, 0ffffffffffffffffh + QWORD 0ffffffff0b0a0504h, 0ffffffffffffffffh + QWORD 0ffff0b0a05040100h, 0ffffffffffffffffh + QWORD 0ffff0b0a05040302h, 0ffffffffffffffffh + QWORD 0b0a050403020100h, 0ffffffffffffffffh + QWORD 0ffffffff0b0a0706h, 0ffffffffffffffffh + QWORD 0ffff0b0a07060100h, 0ffffffffffffffffh + QWORD 0ffff0b0a07060302h, 0ffffffffffffffffh + QWORD 0b0a070603020100h, 0ffffffffffffffffh + QWORD 0ffff0b0a07060504h, 0ffffffffffffffffh + QWORD 0b0a070605040100h, 0ffffffffffffffffh + QWORD 0b0a070605040302h, 0ffffffffffffffffh + QWORD 0706050403020100h, 0ffffffffffff0b0ah + QWORD 0ffffffff0b0a0908h, 0ffffffffffffffffh + QWORD 0ffff0b0a09080100h, 0ffffffffffffffffh + QWORD 0ffff0b0a09080302h, 0ffffffffffffffffh + QWORD 0b0a090803020100h, 0ffffffffffffffffh + QWORD 0ffff0b0a09080504h, 0ffffffffffffffffh + QWORD 0b0a090805040100h, 0ffffffffffffffffh + QWORD 0b0a090805040302h, 0ffffffffffffffffh + QWORD 0908050403020100h, 0ffffffffffff0b0ah + QWORD 0ffff0b0a09080706h, 0ffffffffffffffffh + QWORD 0b0a090807060100h, 0ffffffffffffffffh + QWORD 0b0a090807060302h, 0ffffffffffffffffh + QWORD 0908070603020100h, 0ffffffffffff0b0ah + QWORD 0b0a090807060504h, 0ffffffffffffffffh + QWORD 0908070605040100h, 0ffffffffffff0b0ah + QWORD 
0908070605040302h, 0ffffffffffff0b0ah + QWORD 0706050403020100h, 0ffffffff0b0a0908h + QWORD 0ffffffffffff0d0ch, 0ffffffffffffffffh + QWORD 0ffffffff0d0c0100h, 0ffffffffffffffffh + QWORD 0ffffffff0d0c0302h, 0ffffffffffffffffh + QWORD 0ffff0d0c03020100h, 0ffffffffffffffffh + QWORD 0ffffffff0d0c0504h, 0ffffffffffffffffh + QWORD 0ffff0d0c05040100h, 0ffffffffffffffffh + QWORD 0ffff0d0c05040302h, 0ffffffffffffffffh + QWORD 0d0c050403020100h, 0ffffffffffffffffh + QWORD 0ffffffff0d0c0706h, 0ffffffffffffffffh + QWORD 0ffff0d0c07060100h, 0ffffffffffffffffh + QWORD 0ffff0d0c07060302h, 0ffffffffffffffffh + QWORD 0d0c070603020100h, 0ffffffffffffffffh + QWORD 0ffff0d0c07060504h, 0ffffffffffffffffh + QWORD 0d0c070605040100h, 0ffffffffffffffffh + QWORD 0d0c070605040302h, 0ffffffffffffffffh + QWORD 0706050403020100h, 0ffffffffffff0d0ch + QWORD 0ffffffff0d0c0908h, 0ffffffffffffffffh + QWORD 0ffff0d0c09080100h, 0ffffffffffffffffh + QWORD 0ffff0d0c09080302h, 0ffffffffffffffffh + QWORD 0d0c090803020100h, 0ffffffffffffffffh + QWORD 0ffff0d0c09080504h, 0ffffffffffffffffh + QWORD 0d0c090805040100h, 0ffffffffffffffffh + QWORD 0d0c090805040302h, 0ffffffffffffffffh + QWORD 0908050403020100h, 0ffffffffffff0d0ch + QWORD 0ffff0d0c09080706h, 0ffffffffffffffffh + QWORD 0d0c090807060100h, 0ffffffffffffffffh + QWORD 0d0c090807060302h, 0ffffffffffffffffh + QWORD 0908070603020100h, 0ffffffffffff0d0ch + QWORD 0d0c090807060504h, 0ffffffffffffffffh + QWORD 0908070605040100h, 0ffffffffffff0d0ch + QWORD 0908070605040302h, 0ffffffffffff0d0ch + QWORD 0706050403020100h, 0ffffffff0d0c0908h + QWORD 0ffffffff0d0c0b0ah, 0ffffffffffffffffh + QWORD 0ffff0d0c0b0a0100h, 0ffffffffffffffffh + QWORD 0ffff0d0c0b0a0302h, 0ffffffffffffffffh + QWORD 0d0c0b0a03020100h, 0ffffffffffffffffh + QWORD 0ffff0d0c0b0a0504h, 0ffffffffffffffffh + QWORD 0d0c0b0a05040100h, 0ffffffffffffffffh + QWORD 0d0c0b0a05040302h, 0ffffffffffffffffh + QWORD 0b0a050403020100h, 0ffffffffffff0d0ch + QWORD 0ffff0d0c0b0a0706h, 0ffffffffffffffffh + QWORD 
0d0c0b0a07060100h, 0ffffffffffffffffh + QWORD 0d0c0b0a07060302h, 0ffffffffffffffffh + QWORD 0b0a070603020100h, 0ffffffffffff0d0ch + QWORD 0d0c0b0a07060504h, 0ffffffffffffffffh + QWORD 0b0a070605040100h, 0ffffffffffff0d0ch + QWORD 0b0a070605040302h, 0ffffffffffff0d0ch + QWORD 0706050403020100h, 0ffffffff0d0c0b0ah + QWORD 0ffff0d0c0b0a0908h, 0ffffffffffffffffh + QWORD 0d0c0b0a09080100h, 0ffffffffffffffffh + QWORD 0d0c0b0a09080302h, 0ffffffffffffffffh + QWORD 0b0a090803020100h, 0ffffffffffff0d0ch + QWORD 0d0c0b0a09080504h, 0ffffffffffffffffh + QWORD 0b0a090805040100h, 0ffffffffffff0d0ch + QWORD 0b0a090805040302h, 0ffffffffffff0d0ch + QWORD 0908050403020100h, 0ffffffff0d0c0b0ah + QWORD 0d0c0b0a09080706h, 0ffffffffffffffffh + QWORD 0b0a090807060100h, 0ffffffffffff0d0ch + QWORD 0b0a090807060302h, 0ffffffffffff0d0ch + QWORD 0908070603020100h, 0ffffffff0d0c0b0ah + QWORD 0b0a090807060504h, 0ffffffffffff0d0ch + QWORD 0908070605040100h, 0ffffffff0d0c0b0ah + QWORD 0908070605040302h, 0ffffffff0d0c0b0ah + QWORD 0706050403020100h, 0ffff0d0c0b0a0908h + QWORD 0ffffffffffff0f0eh, 0ffffffffffffffffh + QWORD 0ffffffff0f0e0100h, 0ffffffffffffffffh + QWORD 0ffffffff0f0e0302h, 0ffffffffffffffffh + QWORD 0ffff0f0e03020100h, 0ffffffffffffffffh + QWORD 0ffffffff0f0e0504h, 0ffffffffffffffffh + QWORD 0ffff0f0e05040100h, 0ffffffffffffffffh + QWORD 0ffff0f0e05040302h, 0ffffffffffffffffh + QWORD 0f0e050403020100h, 0ffffffffffffffffh + QWORD 0ffffffff0f0e0706h, 0ffffffffffffffffh + QWORD 0ffff0f0e07060100h, 0ffffffffffffffffh + QWORD 0ffff0f0e07060302h, 0ffffffffffffffffh + QWORD 0f0e070603020100h, 0ffffffffffffffffh + QWORD 0ffff0f0e07060504h, 0ffffffffffffffffh + QWORD 0f0e070605040100h, 0ffffffffffffffffh + QWORD 0f0e070605040302h, 0ffffffffffffffffh + QWORD 0706050403020100h, 0ffffffffffff0f0eh + QWORD 0ffffffff0f0e0908h, 0ffffffffffffffffh + QWORD 0ffff0f0e09080100h, 0ffffffffffffffffh + QWORD 0ffff0f0e09080302h, 0ffffffffffffffffh + QWORD 0f0e090803020100h, 0ffffffffffffffffh + QWORD 
0ffff0f0e09080504h, 0ffffffffffffffffh + QWORD 0f0e090805040100h, 0ffffffffffffffffh + QWORD 0f0e090805040302h, 0ffffffffffffffffh + QWORD 0908050403020100h, 0ffffffffffff0f0eh + QWORD 0ffff0f0e09080706h, 0ffffffffffffffffh + QWORD 0f0e090807060100h, 0ffffffffffffffffh + QWORD 0f0e090807060302h, 0ffffffffffffffffh + QWORD 0908070603020100h, 0ffffffffffff0f0eh + QWORD 0f0e090807060504h, 0ffffffffffffffffh + QWORD 0908070605040100h, 0ffffffffffff0f0eh + QWORD 0908070605040302h, 0ffffffffffff0f0eh + QWORD 0706050403020100h, 0ffffffff0f0e0908h + QWORD 0ffffffff0f0e0b0ah, 0ffffffffffffffffh + QWORD 0ffff0f0e0b0a0100h, 0ffffffffffffffffh + QWORD 0ffff0f0e0b0a0302h, 0ffffffffffffffffh + QWORD 0f0e0b0a03020100h, 0ffffffffffffffffh + QWORD 0ffff0f0e0b0a0504h, 0ffffffffffffffffh + QWORD 0f0e0b0a05040100h, 0ffffffffffffffffh + QWORD 0f0e0b0a05040302h, 0ffffffffffffffffh + QWORD 0b0a050403020100h, 0ffffffffffff0f0eh + QWORD 0ffff0f0e0b0a0706h, 0ffffffffffffffffh + QWORD 0f0e0b0a07060100h, 0ffffffffffffffffh + QWORD 0f0e0b0a07060302h, 0ffffffffffffffffh + QWORD 0b0a070603020100h, 0ffffffffffff0f0eh + QWORD 0f0e0b0a07060504h, 0ffffffffffffffffh + QWORD 0b0a070605040100h, 0ffffffffffff0f0eh + QWORD 0b0a070605040302h, 0ffffffffffff0f0eh + QWORD 0706050403020100h, 0ffffffff0f0e0b0ah + QWORD 0ffff0f0e0b0a0908h, 0ffffffffffffffffh + QWORD 0f0e0b0a09080100h, 0ffffffffffffffffh + QWORD 0f0e0b0a09080302h, 0ffffffffffffffffh + QWORD 0b0a090803020100h, 0ffffffffffff0f0eh + QWORD 0f0e0b0a09080504h, 0ffffffffffffffffh + QWORD 0b0a090805040100h, 0ffffffffffff0f0eh + QWORD 0b0a090805040302h, 0ffffffffffff0f0eh + QWORD 0908050403020100h, 0ffffffff0f0e0b0ah + QWORD 0f0e0b0a09080706h, 0ffffffffffffffffh + QWORD 0b0a090807060100h, 0ffffffffffff0f0eh + QWORD 0b0a090807060302h, 0ffffffffffff0f0eh + QWORD 0908070603020100h, 0ffffffff0f0e0b0ah + QWORD 0b0a090807060504h, 0ffffffffffff0f0eh + QWORD 0908070605040100h, 0ffffffff0f0e0b0ah + QWORD 0908070605040302h, 0ffffffff0f0e0b0ah + QWORD 
0706050403020100h, 0ffff0f0e0b0a0908h + QWORD 0ffffffff0f0e0d0ch, 0ffffffffffffffffh + QWORD 0ffff0f0e0d0c0100h, 0ffffffffffffffffh + QWORD 0ffff0f0e0d0c0302h, 0ffffffffffffffffh + QWORD 0f0e0d0c03020100h, 0ffffffffffffffffh + QWORD 0ffff0f0e0d0c0504h, 0ffffffffffffffffh + QWORD 0f0e0d0c05040100h, 0ffffffffffffffffh + QWORD 0f0e0d0c05040302h, 0ffffffffffffffffh + QWORD 0d0c050403020100h, 0ffffffffffff0f0eh + QWORD 0ffff0f0e0d0c0706h, 0ffffffffffffffffh + QWORD 0f0e0d0c07060100h, 0ffffffffffffffffh + QWORD 0f0e0d0c07060302h, 0ffffffffffffffffh + QWORD 0d0c070603020100h, 0ffffffffffff0f0eh + QWORD 0f0e0d0c07060504h, 0ffffffffffffffffh + QWORD 0d0c070605040100h, 0ffffffffffff0f0eh + QWORD 0d0c070605040302h, 0ffffffffffff0f0eh + QWORD 0706050403020100h, 0ffffffff0f0e0d0ch + QWORD 0ffff0f0e0d0c0908h, 0ffffffffffffffffh + QWORD 0f0e0d0c09080100h, 0ffffffffffffffffh + QWORD 0f0e0d0c09080302h, 0ffffffffffffffffh + QWORD 0d0c090803020100h, 0ffffffffffff0f0eh + QWORD 0f0e0d0c09080504h, 0ffffffffffffffffh + QWORD 0d0c090805040100h, 0ffffffffffff0f0eh + QWORD 0d0c090805040302h, 0ffffffffffff0f0eh + QWORD 0908050403020100h, 0ffffffff0f0e0d0ch + QWORD 0f0e0d0c09080706h, 0ffffffffffffffffh + QWORD 0d0c090807060100h, 0ffffffffffff0f0eh + QWORD 0d0c090807060302h, 0ffffffffffff0f0eh + QWORD 0908070603020100h, 0ffffffff0f0e0d0ch + QWORD 0d0c090807060504h, 0ffffffffffff0f0eh + QWORD 0908070605040100h, 0ffffffff0f0e0d0ch + QWORD 0908070605040302h, 0ffffffff0f0e0d0ch + QWORD 0706050403020100h, 0ffff0f0e0d0c0908h + QWORD 0ffff0f0e0d0c0b0ah, 0ffffffffffffffffh + QWORD 0f0e0d0c0b0a0100h, 0ffffffffffffffffh + QWORD 0f0e0d0c0b0a0302h, 0ffffffffffffffffh + QWORD 0d0c0b0a03020100h, 0ffffffffffff0f0eh + QWORD 0f0e0d0c0b0a0504h, 0ffffffffffffffffh + QWORD 0d0c0b0a05040100h, 0ffffffffffff0f0eh + QWORD 0d0c0b0a05040302h, 0ffffffffffff0f0eh + QWORD 0b0a050403020100h, 0ffffffff0f0e0d0ch + QWORD 0f0e0d0c0b0a0706h, 0ffffffffffffffffh + QWORD 0d0c0b0a07060100h, 0ffffffffffff0f0eh + QWORD 
0d0c0b0a07060302h, 0ffffffffffff0f0eh + QWORD 0b0a070603020100h, 0ffffffff0f0e0d0ch + QWORD 0d0c0b0a07060504h, 0ffffffffffff0f0eh + QWORD 0b0a070605040100h, 0ffffffff0f0e0d0ch + QWORD 0b0a070605040302h, 0ffffffff0f0e0d0ch + QWORD 0706050403020100h, 0ffff0f0e0d0c0b0ah + QWORD 0f0e0d0c0b0a0908h, 0ffffffffffffffffh + QWORD 0d0c0b0a09080100h, 0ffffffffffff0f0eh + QWORD 0d0c0b0a09080302h, 0ffffffffffff0f0eh + QWORD 0b0a090803020100h, 0ffffffff0f0e0d0ch + QWORD 0d0c0b0a09080504h, 0ffffffffffff0f0eh + QWORD 0b0a090805040100h, 0ffffffff0f0e0d0ch + QWORD 0b0a090805040302h, 0ffffffff0f0e0d0ch + QWORD 0908050403020100h, 0ffff0f0e0d0c0b0ah + QWORD 0d0c0b0a09080706h, 0ffffffffffff0f0eh + QWORD 0b0a090807060100h, 0ffffffff0f0e0d0ch + QWORD 0b0a090807060302h, 0ffffffff0f0e0d0ch + QWORD 0908070603020100h, 0ffff0f0e0d0c0b0ah + QWORD 0b0a090807060504h, 0ffffffff0f0e0d0ch + QWORD 0908070605040100h, 0ffff0f0e0d0c0b0ah + QWORD 0908070605040302h, 0ffff0f0e0d0c0b0ah + QWORD 0706050403020100h, 0f0e0d0c0b0a0908h +ptr_L_mldsa_shufb_rej_idx QWORD L_mldsa_shufb_rej_idx +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_extract_coeffs_eta2_mask_nibbles WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh + WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh +ptr_L_mldsa_extract_coeffs_eta2_mask_nibbles QWORD L_mldsa_extract_coeffs_eta2_mask_nibbles +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_extract_coeffs_eta2_mul WORD 3340h, 3340h, 3340h, 3340h, 3340h, 3340h, 3340h, 3340h + WORD 3340h, 3340h, 3340h, 3340h, 3340h, 3340h, 3340h, 3340h +ptr_L_mldsa_extract_coeffs_eta2_mul QWORD L_mldsa_extract_coeffs_eta2_mul +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_extract_coeffs_eta2_five WORD 0005h, 0005h, 0005h, 0005h, 0005h, 0005h, 0005h, 0005h + WORD 0005h, 0005h, 0005h, 0005h, 0005h, 0005h, 0005h, 0005h +ptr_L_mldsa_extract_coeffs_eta2_five QWORD L_mldsa_extract_coeffs_eta2_five +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_extract_coeffs_eta2_two WORD 0002h, 0002h, 0002h, 0002h, 0002h, 
0002h, 0002h, 0002h
+        WORD 0002h, 0002h, 0002h, 0002h, 0002h, 0002h, 0002h, 0002h
+ptr_L_mldsa_extract_coeffs_eta2_two QWORD L_mldsa_extract_coeffs_eta2_two
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+; Scalar lookup used by the byte-at-a-time tail: entry n (n = 0..14) is
+; 2 - (n mod 5) as a signed DWORD; entry 15 (a rejected nibble) is 0.
+L_mldsa_extract_coeffs_eta2_nibble_table DWORD 00000002h, 00000001h, 00000000h, 0ffffffffh
+        DWORD 0fffffffeh, 00000002h, 00000001h, 00000000h
+        DWORD 0ffffffffh, 0fffffffeh, 00000002h, 00000001h
+        DWORD 00000000h, 0ffffffffh, 0fffffffeh, 00000000h
+ptr_L_mldsa_extract_coeffs_eta2_nibble_table QWORD L_mldsa_extract_coeffs_eta2_nibble_table
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+; wc_mldsa_extract_coeffs_eta2_avx2 - turn nibbles into eta=2 coefficients.
+;
+; Arguments as used below (Microsoft x64 register order):
+;   rcx  source bytes (two candidate nibbles per byte, low nibble first)
+;   edx  number of source bytes remaining
+;   r8   address where the next coefficient (signed DWORD) is stored
+;   r9   pointer to a DWORD coefficient count; read on entry, updated on exit
+;
+; A nibble n is accepted when n < 15 and produces 2 - (n mod 5).
+; Processing stops at 256 coefficients or when the source runs out.
+;
+; Phases:
+;   1. Only when the count is 0 on entry: 128 source bytes, unrolled 8x16.
+;   2. 8 source bytes at a time while count <= 240 (at most 16 new values).
+;   3. One byte at a time for the remainder.
+;
+; The vector phases always store 8 DWORDs and then advance by the number
+; actually accepted, so later stores overwrite the unused lanes.
+;
+; NOTE(review): phase 1 does not check edx >= 128 and phase 2 does not
+; check edx >= 8 before its first pass - presumably the caller always
+; supplies enough bytes in multiples of 8; confirm against the caller.
+;
+; Shuffle-table entries are 16 bytes (index = mask << 4), so they are
+; loaded with 16-byte xmm loads. The high half of each shuffle control
+; comes from vinserti128, which makes a 32-byte load unnecessary and, for
+; the last entry, a read past the end of L_mldsa_shufb_rej_idx.
+wc_mldsa_extract_coeffs_eta2_avx2 PROC
+        push rbx
+        push r12
+        push r13
+        push r14
+        push r15
+        ; xmm6-xmm9 are callee-saved in the Microsoft x64 ABI.
+        sub rsp, 64
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        ; ymm6 = 15 (nibble mask and reject bound), ymm7 = 0x3340,
+        ; ymm8 = 5, ymm9 = 2, each replicated across 16 words.
+        vmovdqu ymm6, YMMWORD PTR L_mldsa_extract_coeffs_eta2_mask_nibbles
+        vmovdqu ymm7, YMMWORD PTR L_mldsa_extract_coeffs_eta2_mul
+        vmovdqu ymm8, YMMWORD PTR L_mldsa_extract_coeffs_eta2_five
+        vmovdqu ymm9, YMMWORD PTR L_mldsa_extract_coeffs_eta2_two
+        mov r15, QWORD PTR [ptr_L_mldsa_shufb_rej_idx]
+        mov r10d, DWORD PTR [r9]
+        cmp r10d, 0
+        jne L_mldsa_extract_coeffs_eta2_less_than_256
+        ; ---- Phase 1, source bytes 0..15 ----
+        ; Widen each byte to a DWORD, then copy it up by 12 bits and mask so
+        ; the low word holds the low nibble and the high word the high nibble.
+        vpmovzxbd ymm0, QWORD PTR [rcx]
+        vpmovzxbd ymm1, QWORD PTR [rcx+8]
+        vpslld ymm2, ymm0, 12
+        vpslld ymm3, ymm1, 12
+        vpor ymm0, ymm0, ymm2
+        vpor ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm6
+        vpand ymm1, ymm1, ymm6
+        ; Accept mask: word is all-ones where 15 > nibble.
+        vpcmpgtw ymm2, ymm6, ymm0
+        vpcmpgtw ymm3, ymm6, ymm1
+        vpacksswb ymm2, ymm2, ymm3
+        vpmovmskb eax, ymm2
+        ; Split the 32 mask bits into one byte per 128-bit lane:
+        ; r12 = ymm0 low, rbx = ymm1 low, r13 = ymm0 high, r14 = ymm1 high.
+        mov r13d, eax
+        mov r14d, eax
+        movzx r12d, al
+        movzx ebx, ah
+        shr r13, 16
+        shr r14, 24
+        and r13d, 255
+        ; Scale by 16 to index the 16-byte shuffle entries.
+        shl r12d, 4
+        shl ebx, 4
+        shl r13d, 4
+        shl r14d, 4
+        vmovdqu xmm2, OWORD PTR [r15+r12]
+        vmovdqu xmm3, OWORD PTR [r15+rbx]
+        vmovdqu xmm4, OWORD PTR [r15+r13]
+        vmovdqu xmm5, OWORD PTR [r15+r14]
+        vinserti128 ymm2, ymm2, xmm4, 1
+        vinserti128 ymm3, ymm3, xmm5, 1
+        ; Compact the accepted words to the front of each lane.
+        vpshufb ymm0, ymm0, ymm2
+        vpshufb ymm1, ymm1, ymm3
+        ; Shifting left by 4 does not change the bit count, so these are
+        ; the numbers of accepted nibbles per lane.
+        popcnt r12d, r12d
+        popcnt ebx, ebx
+        popcnt r13d, r13d
+        popcnt r14d, r14d
+        ; (n * 0x3340) >> 16 is n / 5 for n < 15; result = 2 - n + 5*(n/5).
+        vpmulhw ymm2, ymm0, ymm7
+        vpmulhw ymm3, ymm1, ymm7
+        vpmullw ymm2, ymm2, ymm8
+        vpmullw ymm3, ymm3, ymm8
+        vpsubw ymm0, ymm9, ymm0
+        vpsubw ymm1, ymm9, ymm1
+        vpaddw ymm0, ymm0, ymm2
+        vpaddw ymm1, ymm1, ymm3
+        ; Sign-extend each lane's 8 words to 8 DWORDs.
+        vpmovsxwd ymm2, xmm0
+        vpmovsxwd ymm3, xmm1
+        vextracti128 xmm0, ymm0, 1
+        vextracti128 xmm1, ymm1, 1
+        vpmovsxwd ymm0, xmm0
+        vpmovsxwd ymm1, xmm1
+        ; Store in source order: ymm0 low, ymm0 high, ymm1 low, ymm1 high.
+        vmovdqu YMMWORD PTR [r8], ymm2
+        lea r8, QWORD PTR [r8+4*r12]
+        add r10d, r12d
+        vmovdqu YMMWORD PTR [r8], ymm0
+        lea r8, QWORD PTR [r8+4*r13]
+        add r10d, r13d
+        vmovdqu YMMWORD PTR [r8], ymm3
+        lea r8, QWORD PTR [r8+4*rbx]
+        add r10d, ebx
+        vmovdqu YMMWORD PTR [r8], ymm1
+        lea r8, QWORD PTR [r8+4*r14]
+        add r10d, r14d
+        ; ---- Phase 1, source bytes 16..31 (same steps) ----
+        vpmovzxbd ymm0, QWORD PTR [rcx+16]
+        vpmovzxbd ymm1, QWORD PTR [rcx+24]
+        vpslld ymm2, ymm0, 12
+        vpslld ymm3, ymm1, 12
+        vpor ymm0, ymm0, ymm2
+        vpor ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm6
+        vpand ymm1, ymm1, ymm6
+        vpcmpgtw ymm2, ymm6, ymm0
+        vpcmpgtw ymm3, ymm6, ymm1
+        vpacksswb ymm2, ymm2, ymm3
+        vpmovmskb eax, ymm2
+        mov r13d, eax
+        mov r14d, eax
+        movzx r12d, al
+        movzx ebx, ah
+        shr r13, 16
+        shr r14, 24
+        and r13d, 255
+        shl r12d, 4
+        shl ebx, 4
+        shl r13d, 4
+        shl r14d, 4
+        vmovdqu xmm2, OWORD PTR [r15+r12]
+        vmovdqu xmm3, OWORD PTR [r15+rbx]
+        vmovdqu xmm4, OWORD PTR [r15+r13]
+        vmovdqu xmm5, OWORD PTR [r15+r14]
+        vinserti128 ymm2, ymm2, xmm4, 1
+        vinserti128 ymm3, ymm3, xmm5, 1
+        vpshufb ymm0, ymm0, ymm2
+        vpshufb ymm1, ymm1, ymm3
+        popcnt r12d, r12d
+        popcnt ebx, ebx
+        popcnt r13d, r13d
+        popcnt r14d, r14d
+        vpmulhw ymm2, ymm0, ymm7
+        vpmulhw ymm3, ymm1, ymm7
+        vpmullw ymm2, ymm2, ymm8
+        vpmullw ymm3, ymm3, ymm8
+        vpsubw ymm0, ymm9, ymm0
+        vpsubw ymm1, ymm9, ymm1
+        vpaddw ymm0, ymm0, ymm2
+        vpaddw ymm1, ymm1, ymm3
+        vpmovsxwd ymm2, xmm0
+        vpmovsxwd ymm3, xmm1
+        vextracti128 xmm0, ymm0, 1
+        vextracti128 xmm1, ymm1, 1
+        vpmovsxwd ymm0, xmm0
+        vpmovsxwd ymm1, xmm1
+        vmovdqu YMMWORD PTR [r8], ymm2
+        lea r8, QWORD PTR [r8+4*r12]
+        add r10d, r12d
+        vmovdqu YMMWORD PTR [r8], ymm0
+        lea r8, QWORD PTR [r8+4*r13]
+        add r10d, r13d
+        vmovdqu YMMWORD PTR [r8], ymm3
+        lea r8, QWORD PTR [r8+4*rbx]
+        add r10d, ebx
+        vmovdqu YMMWORD PTR [r8], ymm1
+        lea r8, QWORD PTR [r8+4*r14]
+        add r10d, r14d
+        ; ---- Phase 1, source bytes 32..47 ----
+        vpmovzxbd ymm0, QWORD PTR [rcx+32]
+        vpmovzxbd ymm1, QWORD PTR [rcx+40]
+        vpslld ymm2, ymm0, 12
+        vpslld ymm3, ymm1, 12
+        vpor ymm0, ymm0, ymm2
+        vpor ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm6
+        vpand ymm1, ymm1, ymm6
+        vpcmpgtw ymm2, ymm6, ymm0
+        vpcmpgtw ymm3, ymm6, ymm1
+        vpacksswb ymm2, ymm2, ymm3
+        vpmovmskb eax, ymm2
+        mov r13d, eax
+        mov r14d, eax
+        movzx r12d, al
+        movzx ebx, ah
+        shr r13, 16
+        shr r14, 24
+        and r13d, 255
+        shl r12d, 4
+        shl ebx, 4
+        shl r13d, 4
+        shl r14d, 4
+        vmovdqu xmm2, OWORD PTR [r15+r12]
+        vmovdqu xmm3, OWORD PTR [r15+rbx]
+        vmovdqu xmm4, OWORD PTR [r15+r13]
+        vmovdqu xmm5, OWORD PTR [r15+r14]
+        vinserti128 ymm2, ymm2, xmm4, 1
+        vinserti128 ymm3, ymm3, xmm5, 1
+        vpshufb ymm0, ymm0, ymm2
+        vpshufb ymm1, ymm1, ymm3
+        popcnt r12d, r12d
+        popcnt ebx, ebx
+        popcnt r13d, r13d
+        popcnt r14d, r14d
+        vpmulhw ymm2, ymm0, ymm7
+        vpmulhw ymm3, ymm1, ymm7
+        vpmullw ymm2, ymm2, ymm8
+        vpmullw ymm3, ymm3, ymm8
+        vpsubw ymm0, ymm9, ymm0
+        vpsubw ymm1, ymm9, ymm1
+        vpaddw ymm0, ymm0, ymm2
+        vpaddw ymm1, ymm1, ymm3
+        vpmovsxwd ymm2, xmm0
+        vpmovsxwd ymm3, xmm1
+        vextracti128 xmm0, ymm0, 1
+        vextracti128 xmm1, ymm1, 1
+        vpmovsxwd ymm0, xmm0
+        vpmovsxwd ymm1, xmm1
+        vmovdqu YMMWORD PTR [r8], ymm2
+        lea r8, QWORD PTR [r8+4*r12]
+        add r10d, r12d
+        vmovdqu YMMWORD PTR [r8], ymm0
+        lea r8, QWORD PTR [r8+4*r13]
+        add r10d, r13d
+        vmovdqu YMMWORD PTR [r8], ymm3
+        lea r8, QWORD PTR [r8+4*rbx]
+        add r10d, ebx
+        vmovdqu YMMWORD PTR [r8], ymm1
+        lea r8, QWORD PTR [r8+4*r14]
+        add r10d, r14d
+        ; ---- Phase 1, source bytes 48..63 ----
+        vpmovzxbd ymm0, QWORD PTR [rcx+48]
+        vpmovzxbd ymm1, QWORD PTR [rcx+56]
+        vpslld ymm2, ymm0, 12
+        vpslld ymm3, ymm1, 12
+        vpor ymm0, ymm0, ymm2
+        vpor ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm6
+        vpand ymm1, ymm1, ymm6
+        vpcmpgtw ymm2, ymm6, ymm0
+        vpcmpgtw ymm3, ymm6, ymm1
+        vpacksswb ymm2, ymm2, ymm3
+        vpmovmskb eax, ymm2
+        mov r13d, eax
+        mov r14d, eax
+        movzx r12d, al
+        movzx ebx, ah
+        shr r13, 16
+        shr r14, 24
+        and r13d, 255
+        shl r12d, 4
+        shl ebx, 4
+        shl r13d, 4
+        shl r14d, 4
+        vmovdqu xmm2, OWORD PTR [r15+r12]
+        vmovdqu xmm3, OWORD PTR [r15+rbx]
+        vmovdqu xmm4, OWORD PTR [r15+r13]
+        vmovdqu xmm5, OWORD PTR [r15+r14]
+        vinserti128 ymm2, ymm2, xmm4, 1
+        vinserti128 ymm3, ymm3, xmm5, 1
+        vpshufb ymm0, ymm0, ymm2
+        vpshufb ymm1, ymm1, ymm3
+        popcnt r12d, r12d
+        popcnt ebx, ebx
+        popcnt r13d, r13d
+        popcnt r14d, r14d
+        vpmulhw ymm2, ymm0, ymm7
+        vpmulhw ymm3, ymm1, ymm7
+        vpmullw ymm2, ymm2, ymm8
+        vpmullw ymm3, ymm3, ymm8
+        vpsubw ymm0, ymm9, ymm0
+        vpsubw ymm1, ymm9, ymm1
+        vpaddw ymm0, ymm0, ymm2
+        vpaddw ymm1, ymm1, ymm3
+        vpmovsxwd ymm2, xmm0
+        vpmovsxwd ymm3, xmm1
+        vextracti128 xmm0, ymm0, 1
+        vextracti128 xmm1, ymm1, 1
+        vpmovsxwd ymm0, xmm0
+        vpmovsxwd ymm1, xmm1
+        vmovdqu YMMWORD PTR [r8], ymm2
+        lea r8, QWORD PTR [r8+4*r12]
+        add r10d, r12d
+        vmovdqu YMMWORD PTR [r8], ymm0
+        lea r8, QWORD PTR [r8+4*r13]
+        add r10d, r13d
+        vmovdqu YMMWORD PTR [r8], ymm3
+        lea r8, QWORD PTR [r8+4*rbx]
+        add r10d, ebx
+        vmovdqu YMMWORD PTR [r8], ymm1
+        lea r8, QWORD PTR [r8+4*r14]
+        add r10d, r14d
+        ; ---- Phase 1, source bytes 64..79 ----
+        vpmovzxbd ymm0, QWORD PTR [rcx+64]
+        vpmovzxbd ymm1, QWORD PTR [rcx+72]
+        vpslld ymm2, ymm0, 12
+        vpslld ymm3, ymm1, 12
+        vpor ymm0, ymm0, ymm2
+        vpor ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm6
+        vpand ymm1, ymm1, ymm6
+        vpcmpgtw ymm2, ymm6, ymm0
+        vpcmpgtw ymm3, ymm6, ymm1
+        vpacksswb ymm2, ymm2, ymm3
+        vpmovmskb eax, ymm2
+        mov r13d, eax
+        mov r14d, eax
+        movzx r12d, al
+        movzx ebx, ah
+        shr r13, 16
+        shr r14, 24
+        and r13d, 255
+        shl r12d, 4
+        shl ebx, 4
+        shl r13d, 4
+        shl r14d, 4
+        vmovdqu xmm2, OWORD PTR [r15+r12]
+        vmovdqu xmm3, OWORD PTR [r15+rbx]
+        vmovdqu xmm4, OWORD PTR [r15+r13]
+        vmovdqu xmm5, OWORD PTR [r15+r14]
+        vinserti128 ymm2, ymm2, xmm4, 1
+        vinserti128 ymm3, ymm3, xmm5, 1
+        vpshufb ymm0, ymm0, ymm2
+        vpshufb ymm1, ymm1, ymm3
+        popcnt r12d, r12d
+        popcnt ebx, ebx
+        popcnt r13d, r13d
+        popcnt r14d, r14d
+        vpmulhw ymm2, ymm0, ymm7
+        vpmulhw ymm3, ymm1, ymm7
+        vpmullw ymm2, ymm2, ymm8
+        vpmullw ymm3, ymm3, ymm8
+        vpsubw ymm0, ymm9, ymm0
+        vpsubw ymm1, ymm9, ymm1
+        vpaddw ymm0, ymm0, ymm2
+        vpaddw ymm1, ymm1, ymm3
+        vpmovsxwd ymm2, xmm0
+        vpmovsxwd ymm3, xmm1
+        vextracti128 xmm0, ymm0, 1
+        vextracti128 xmm1, ymm1, 1
+        vpmovsxwd ymm0, xmm0
+        vpmovsxwd ymm1, xmm1
+        vmovdqu YMMWORD PTR [r8], ymm2
+        lea r8, QWORD PTR [r8+4*r12]
+        add r10d, r12d
+        vmovdqu YMMWORD PTR [r8], ymm0
+        lea r8, QWORD PTR [r8+4*r13]
+        add r10d, r13d
+        vmovdqu YMMWORD PTR [r8], ymm3
+        lea r8, QWORD PTR [r8+4*rbx]
+        add r10d, ebx
+        vmovdqu YMMWORD PTR [r8], ymm1
+        lea r8, QWORD PTR [r8+4*r14]
+        add r10d, r14d
+        ; ---- Phase 1, source bytes 80..95 ----
+        vpmovzxbd ymm0, QWORD PTR [rcx+80]
+        vpmovzxbd ymm1, QWORD PTR [rcx+88]
+        vpslld ymm2, ymm0, 12
+        vpslld ymm3, ymm1, 12
+        vpor ymm0, ymm0, ymm2
+        vpor ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm6
+        vpand ymm1, ymm1, ymm6
+        vpcmpgtw ymm2, ymm6, ymm0
+        vpcmpgtw ymm3, ymm6, ymm1
+        vpacksswb ymm2, ymm2, ymm3
+        vpmovmskb eax, ymm2
+        mov r13d, eax
+        mov r14d, eax
+        movzx r12d, al
+        movzx ebx, ah
+        shr r13, 16
+        shr r14, 24
+        and r13d, 255
+        shl r12d, 4
+        shl ebx, 4
+        shl r13d, 4
+        shl r14d, 4
+        vmovdqu xmm2, OWORD PTR [r15+r12]
+        vmovdqu xmm3, OWORD PTR [r15+rbx]
+        vmovdqu xmm4, OWORD PTR [r15+r13]
+        vmovdqu xmm5, OWORD PTR [r15+r14]
+        vinserti128 ymm2, ymm2, xmm4, 1
+        vinserti128 ymm3, ymm3, xmm5, 1
+        vpshufb ymm0, ymm0, ymm2
+        vpshufb ymm1, ymm1, ymm3
+        popcnt r12d, r12d
+        popcnt ebx, ebx
+        popcnt r13d, r13d
+        popcnt r14d, r14d
+        vpmulhw ymm2, ymm0, ymm7
+        vpmulhw ymm3, ymm1, ymm7
+        vpmullw ymm2, ymm2, ymm8
+        vpmullw ymm3, ymm3, ymm8
+        vpsubw ymm0, ymm9, ymm0
+        vpsubw ymm1, ymm9, ymm1
+        vpaddw ymm0, ymm0, ymm2
+        vpaddw ymm1, ymm1, ymm3
+        vpmovsxwd ymm2, xmm0
+        vpmovsxwd ymm3, xmm1
+        vextracti128 xmm0, ymm0, 1
+        vextracti128 xmm1, ymm1, 1
+        vpmovsxwd ymm0, xmm0
+        vpmovsxwd ymm1, xmm1
+        vmovdqu YMMWORD PTR [r8], ymm2
+        lea r8, QWORD PTR [r8+4*r12]
+        add r10d, r12d
+        vmovdqu YMMWORD PTR [r8], ymm0
+        lea r8, QWORD PTR [r8+4*r13]
+        add r10d, r13d
+        vmovdqu YMMWORD PTR [r8], ymm3
+        lea r8, QWORD PTR [r8+4*rbx]
+        add r10d, ebx
+        vmovdqu YMMWORD PTR [r8], ymm1
+        lea r8, QWORD PTR [r8+4*r14]
+        add r10d, r14d
+        ; ---- Phase 1, source bytes 96..111 ----
+        vpmovzxbd ymm0, QWORD PTR [rcx+96]
+        vpmovzxbd ymm1, QWORD PTR [rcx+104]
+        vpslld ymm2, ymm0, 12
+        vpslld ymm3, ymm1, 12
+        vpor ymm0, ymm0, ymm2
+        vpor ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm6
+        vpand ymm1, ymm1, ymm6
+        vpcmpgtw ymm2, ymm6, ymm0
+        vpcmpgtw ymm3, ymm6, ymm1
+        vpacksswb ymm2, ymm2, ymm3
+        vpmovmskb eax, ymm2
+        mov r13d, eax
+        mov r14d, eax
+        movzx r12d, al
+        movzx ebx, ah
+        shr r13, 16
+        shr r14, 24
+        and r13d, 255
+        shl r12d, 4
+        shl ebx, 4
+        shl r13d, 4
+        shl r14d, 4
+        vmovdqu xmm2, OWORD PTR [r15+r12]
+        vmovdqu xmm3, OWORD PTR [r15+rbx]
+        vmovdqu xmm4, OWORD PTR [r15+r13]
+        vmovdqu xmm5, OWORD PTR [r15+r14]
+        vinserti128 ymm2, ymm2, xmm4, 1
+        vinserti128 ymm3, ymm3, xmm5, 1
+        vpshufb ymm0, ymm0, ymm2
+        vpshufb ymm1, ymm1, ymm3
+        popcnt r12d, r12d
+        popcnt ebx, ebx
+        popcnt r13d, r13d
+        popcnt r14d, r14d
+        vpmulhw ymm2, ymm0, ymm7
+        vpmulhw ymm3, ymm1, ymm7
+        vpmullw ymm2, ymm2, ymm8
+        vpmullw ymm3, ymm3, ymm8
+        vpsubw ymm0, ymm9, ymm0
+        vpsubw ymm1, ymm9, ymm1
+        vpaddw ymm0, ymm0, ymm2
+        vpaddw ymm1, ymm1, ymm3
+        vpmovsxwd ymm2, xmm0
+        vpmovsxwd ymm3, xmm1
+        vextracti128 xmm0, ymm0, 1
+        vextracti128 xmm1, ymm1, 1
+        vpmovsxwd ymm0, xmm0
+        vpmovsxwd ymm1, xmm1
+        vmovdqu YMMWORD PTR [r8], ymm2
+        lea r8, QWORD PTR [r8+4*r12]
+        add r10d, r12d
+        vmovdqu YMMWORD PTR [r8], ymm0
+        lea r8, QWORD PTR [r8+4*r13]
+        add r10d, r13d
+        vmovdqu YMMWORD PTR [r8], ymm3
+        lea r8, QWORD PTR [r8+4*rbx]
+        add r10d, ebx
+        vmovdqu YMMWORD PTR [r8], ymm1
+        lea r8, QWORD PTR [r8+4*r14]
+        add r10d, r14d
+        ; ---- Phase 1, source bytes 112..127 ----
+        vpmovzxbd ymm0, QWORD PTR [rcx+112]
+        vpmovzxbd ymm1, QWORD PTR [rcx+120]
+        vpslld ymm2, ymm0, 12
+        vpslld ymm3, ymm1, 12
+        vpor ymm0, ymm0, ymm2
+        vpor ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm6
+        vpand ymm1, ymm1, ymm6
+        vpcmpgtw ymm2, ymm6, ymm0
+        vpcmpgtw ymm3, ymm6, ymm1
+        vpacksswb ymm2, ymm2, ymm3
+        vpmovmskb eax, ymm2
+        mov r13d, eax
+        mov r14d, eax
+        movzx r12d, al
+        movzx ebx, ah
+        shr r13, 16
+        shr r14, 24
+        and r13d, 255
+        shl r12d, 4
+        shl ebx, 4
+        shl r13d, 4
+        shl r14d, 4
+        vmovdqu xmm2, OWORD PTR [r15+r12]
+        vmovdqu xmm3, OWORD PTR [r15+rbx]
+        vmovdqu xmm4, OWORD PTR [r15+r13]
+        vmovdqu xmm5, OWORD PTR [r15+r14]
+        vinserti128 ymm2, ymm2, xmm4, 1
+        vinserti128 ymm3, ymm3, xmm5, 1
+        vpshufb ymm0, ymm0, ymm2
+        vpshufb ymm1, ymm1, ymm3
+        popcnt r12d, r12d
+        popcnt ebx, ebx
+        popcnt r13d, r13d
+        popcnt r14d, r14d
+        vpmulhw ymm2, ymm0, ymm7
+        vpmulhw ymm3, ymm1, ymm7
+        vpmullw ymm2, ymm2, ymm8
+        vpmullw ymm3, ymm3, ymm8
+        vpsubw ymm0, ymm9, ymm0
+        vpsubw ymm1, ymm9, ymm1
+        vpaddw ymm0, ymm0, ymm2
+        vpaddw ymm1, ymm1, ymm3
+        vpmovsxwd ymm2, xmm0
+        vpmovsxwd ymm3, xmm1
+        vextracti128 xmm0, ymm0, 1
+        vextracti128 xmm1, ymm1, 1
+        vpmovsxwd ymm0, xmm0
+        vpmovsxwd ymm1, xmm1
+        vmovdqu YMMWORD PTR [r8], ymm2
+        lea r8, QWORD PTR [r8+4*r12]
+        add r10d, r12d
+        vmovdqu YMMWORD PTR [r8], ymm0
+        lea r8, QWORD PTR [r8+4*r13]
+        add r10d, r13d
+        vmovdqu YMMWORD PTR [r8], ymm3
+        lea r8, QWORD PTR [r8+4*rbx]
+        add r10d, ebx
+        vmovdqu YMMWORD PTR [r8], ymm1
+        lea r8, QWORD PTR [r8+4*r14]
+        add r10d, r14d
+        ; 128 source bytes consumed by phase 1.
+        sub edx, 128
+        add rcx, 128
+L_mldsa_extract_coeffs_eta2_less_than_256:
+        ; Phase 2 can add up to 16 values, so it needs count <= 240.
+        cmp r10d, 240
+        jg L_mldsa_extract_coeffs_eta2_less_than_ymm
+L_mldsa_extract_coeffs_eta2_start_one_ymm:
+        ; ---- Phase 2: 8 source bytes (16 nibbles) per pass ----
+        vpmovzxbd ymm0, QWORD PTR [rcx]
+        vpslld ymm2, ymm0, 12
+        vpor ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm6
+        vpcmpgtw ymm2, ymm6, ymm0
+        ; Packing the mask with itself puts the low lane in bits 0..7 and
+        ; the high lane in bits 16..23 of eax.
+        vpacksswb ymm2, ymm2, ymm2
+        vpmovmskb eax, ymm2
+        mov r13d, eax
+        movzx r12d, al
+        shr r13, 16
+        and r13d, 255
+        shl r12d, 4
+        shl r13d, 4
+        vmovdqu xmm2, OWORD PTR [r15+r12]
+        vmovdqu xmm4, OWORD PTR [r15+r13]
+        vinserti128 ymm2, ymm2, xmm4, 1
+        vpshufb ymm0, ymm0, ymm2
+        popcnt r12d, r12d
+        popcnt r13d, r13d
+        vpmulhw ymm2, ymm0, ymm7
+        vpmullw ymm2, ymm2, ymm8
+        vpsubw ymm0, ymm9, ymm0
+        vpaddw ymm0, ymm0, ymm2
+        vpmovsxwd ymm2, xmm0
+        vextracti128 xmm0, ymm0, 1
+        vpmovsxwd ymm0, xmm0
+        vmovdqu YMMWORD PTR [r8], ymm2
+        lea r8, QWORD PTR [r8+4*r12]
+        add r10d, r12d
+        vmovdqu YMMWORD PTR [r8], ymm0
+        lea r8, QWORD PTR [r8+4*r13]
+        add r10d, r13d
+        sub edx, 8
+        add rcx, 8
+        ; Continue only with at least 8 source bytes and room for 16 values.
+        cmp edx, 8
+        jl L_mldsa_extract_coeffs_eta2_less_than_ymm
+        cmp r10d, 240
+        jle L_mldsa_extract_coeffs_eta2_start_one_ymm
+L_mldsa_extract_coeffs_eta2_less_than_ymm:
+        cmp r10d, 256
+        je L_mldsa_extract_coeffs_eta2_done
+L_mldsa_extract_coeffs_eta2_start_byte:
+        ; ---- Phase 3: one source byte (two nibbles) per pass ----
+        ; r15 is repurposed here as the scalar nibble table base.
+        mov r15, QWORD PTR [ptr_L_mldsa_extract_coeffs_eta2_nibble_table]
+        cmp edx, 0
+        je L_mldsa_extract_coeffs_eta2_done
+        movzx ebx, BYTE PTR [rcx]
+        add rcx, 1
+        sub edx, 1
+        ; ebx = low nibble, eax = high nibble.
+        mov eax, ebx
+        shr eax, 4
+        and bl, 15
+        ; r13 = 1 when the nibble is below 15 (carry from the compare).
+        xor r13, r13
+        cmp bl, 15
+        adc r13d, 0
+        ; Always store the looked-up value; advance only when accepted.
+        mov r14d, DWORD PTR [r15+4*rbx]
+        mov DWORD PTR [r8], r14d
+        add r10d, r13d
+        shl r13d, 2
+        add r8, r13
+        cmp r10d, 256
+        je L_mldsa_extract_coeffs_eta2_done
+        ; Same again for the high nibble.
+        xor r13, r13
+        cmp al, 15
+        adc r13d, 0
+        mov r14d, DWORD PTR [r15+4*rax]
+        mov DWORD PTR [r8], r14d
+        add r10d, r13d
+        shl r13d, 2
+        add r8, r13
+        cmp r10d, 256
+        je L_mldsa_extract_coeffs_eta2_done
+        jmp L_mldsa_extract_coeffs_eta2_start_byte
+L_mldsa_extract_coeffs_eta2_done:
+        ; Publish the updated count and restore callee-saved state.
+        mov DWORD PTR [r9], r10d
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        add rsp, 64
+        pop r15
+        pop r14
+        pop r13
+        pop r12
+        pop rbx
+        ret
+wc_mldsa_extract_coeffs_eta2_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_extract_coeffs_eta4_mask_nibbles WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh
+        WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh
+ptr_L_mldsa_extract_coeffs_eta4_mask_nibbles QWORD
L_mldsa_extract_coeffs_eta4_mask_nibbles +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_extract_coeffs_eta4_nine WORD 0009h, 0009h, 0009h, 0009h, 0009h, 0009h, 0009h, 0009h + WORD 0009h, 0009h, 0009h, 0009h, 0009h, 0009h, 0009h, 0009h +ptr_L_mldsa_extract_coeffs_eta4_nine QWORD L_mldsa_extract_coeffs_eta4_nine +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_extract_coeffs_eta4_four WORD 0004h, 0004h, 0004h, 0004h, 0004h, 0004h, 0004h, 0004h + WORD 0004h, 0004h, 0004h, 0004h, 0004h, 0004h, 0004h, 0004h +ptr_L_mldsa_extract_coeffs_eta4_four QWORD L_mldsa_extract_coeffs_eta4_four +_DATA ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_extract_coeffs_eta4_avx2 PROC + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp, 48 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu ymm6, YMMWORD PTR L_mldsa_extract_coeffs_eta4_mask_nibbles + vmovdqu ymm7, YMMWORD PTR L_mldsa_extract_coeffs_eta4_nine + vmovdqu ymm8, YMMWORD PTR L_mldsa_extract_coeffs_eta4_four + mov r15, QWORD PTR [ptr_L_mldsa_shufb_rej_idx] + mov r10d, DWORD PTR [r9] + cmp r10d, 0 + jne L_mldsa_extract_coeffs_eta4_less_than_256 + vpmovzxbd ymm0, QWORD PTR [rcx] + vpmovzxbd ymm1, QWORD PTR [rcx+8] + vpslld ymm2, ymm0, 12 + vpslld ymm3, ymm1, 12 + vpor ymm0, ymm0, ymm2 + vpor ymm1, ymm1, ymm3 + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpcmpgtw ymm2, ymm7, ymm0 + vpcmpgtw ymm3, ymm7, ymm1 + vpacksswb ymm2, ymm2, ymm3 + vpmovmskb eax, ymm2 + mov r13d, eax + mov r14d, eax + movzx r12d, al + movzx ebx, ah + shr r13, 16 + shr r14, 24 + and r13d, 255 + shl r12d, 4 + shl ebx, 4 + shl r13d, 4 + shl r14d, 4 + vmovdqu xmm2, OWORD PTR [r15+r12] + vmovdqu xmm3, OWORD PTR [r15+rbx] + vmovdqu xmm4, OWORD PTR [r15+r13] + vmovdqu xmm5, OWORD PTR [r15+r14] + vinserti128 ymm2, ymm2, xmm4, 1 + vinserti128 ymm3, ymm3, xmm5, 1 + vpshufb ymm0, ymm0, ymm2 + vpshufb ymm1, ymm1, ymm3 + popcnt r12d, r12d + popcnt ebx, ebx + popcnt r13d, r13d + popcnt r14d, r14d + 
vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm8, ymm1 + vpmovsxwd ymm2, xmm0 + vpmovsxwd ymm3, xmm1 + vextracti128 xmm0, ymm0, 1 + vextracti128 xmm1, ymm1, 1 + vpmovsxwd ymm0, xmm0 + vpmovsxwd ymm1, xmm1 + vmovdqu YMMWORD PTR [r8], ymm2 + lea r8, QWORD PTR [r8+4*r12] + add r10d, r12d + vmovdqu YMMWORD PTR [r8], ymm0 + lea r8, QWORD PTR [r8+4*r13] + add r10d, r13d + vmovdqu YMMWORD PTR [r8], ymm3 + lea r8, QWORD PTR [r8+4*rbx] + add r10d, ebx + vmovdqu YMMWORD PTR [r8], ymm1 + lea r8, QWORD PTR [r8+4*r14] + add r10d, r14d + vpmovzxbd ymm0, QWORD PTR [rcx+16] + vpmovzxbd ymm1, QWORD PTR [rcx+24] + vpslld ymm2, ymm0, 12 + vpslld ymm3, ymm1, 12 + vpor ymm0, ymm0, ymm2 + vpor ymm1, ymm1, ymm3 + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpcmpgtw ymm2, ymm7, ymm0 + vpcmpgtw ymm3, ymm7, ymm1 + vpacksswb ymm2, ymm2, ymm3 + vpmovmskb eax, ymm2 + mov r13d, eax + mov r14d, eax + movzx r12d, al + movzx ebx, ah + shr r13, 16 + shr r14, 24 + and r13d, 255 + shl r12d, 4 + shl ebx, 4 + shl r13d, 4 + shl r14d, 4 + vmovdqu xmm2, OWORD PTR [r15+r12] + vmovdqu xmm3, OWORD PTR [r15+rbx] + vmovdqu xmm4, OWORD PTR [r15+r13] + vmovdqu xmm5, OWORD PTR [r15+r14] + vinserti128 ymm2, ymm2, xmm4, 1 + vinserti128 ymm3, ymm3, xmm5, 1 + vpshufb ymm0, ymm0, ymm2 + vpshufb ymm1, ymm1, ymm3 + popcnt r12d, r12d + popcnt ebx, ebx + popcnt r13d, r13d + popcnt r14d, r14d + vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm8, ymm1 + vpmovsxwd ymm2, xmm0 + vpmovsxwd ymm3, xmm1 + vextracti128 xmm0, ymm0, 1 + vextracti128 xmm1, ymm1, 1 + vpmovsxwd ymm0, xmm0 + vpmovsxwd ymm1, xmm1 + vmovdqu YMMWORD PTR [r8], ymm2 + lea r8, QWORD PTR [r8+4*r12] + add r10d, r12d + vmovdqu YMMWORD PTR [r8], ymm0 + lea r8, QWORD PTR [r8+4*r13] + add r10d, r13d + vmovdqu YMMWORD PTR [r8], ymm3 + lea r8, QWORD PTR [r8+4*rbx] + add r10d, ebx + vmovdqu YMMWORD PTR [r8], ymm1 + lea r8, QWORD PTR [r8+4*r14] + add r10d, r14d + vpmovzxbd ymm0, QWORD PTR [rcx+32] + vpmovzxbd ymm1, QWORD PTR [rcx+40] + vpslld ymm2, ymm0, 12 + vpslld ymm3, 
ymm1, 12 + vpor ymm0, ymm0, ymm2 + vpor ymm1, ymm1, ymm3 + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpcmpgtw ymm2, ymm7, ymm0 + vpcmpgtw ymm3, ymm7, ymm1 + vpacksswb ymm2, ymm2, ymm3 + vpmovmskb eax, ymm2 + mov r13d, eax + mov r14d, eax + movzx r12d, al + movzx ebx, ah + shr r13, 16 + shr r14, 24 + and r13d, 255 + shl r12d, 4 + shl ebx, 4 + shl r13d, 4 + shl r14d, 4 + vmovdqu xmm2, OWORD PTR [r15+r12] + vmovdqu xmm3, OWORD PTR [r15+rbx] + vmovdqu xmm4, OWORD PTR [r15+r13] + vmovdqu xmm5, OWORD PTR [r15+r14] + vinserti128 ymm2, ymm2, xmm4, 1 + vinserti128 ymm3, ymm3, xmm5, 1 + vpshufb ymm0, ymm0, ymm2 + vpshufb ymm1, ymm1, ymm3 + popcnt r12d, r12d + popcnt ebx, ebx + popcnt r13d, r13d + popcnt r14d, r14d + vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm8, ymm1 + vpmovsxwd ymm2, xmm0 + vpmovsxwd ymm3, xmm1 + vextracti128 xmm0, ymm0, 1 + vextracti128 xmm1, ymm1, 1 + vpmovsxwd ymm0, xmm0 + vpmovsxwd ymm1, xmm1 + vmovdqu YMMWORD PTR [r8], ymm2 + lea r8, QWORD PTR [r8+4*r12] + add r10d, r12d + vmovdqu YMMWORD PTR [r8], ymm0 + lea r8, QWORD PTR [r8+4*r13] + add r10d, r13d + vmovdqu YMMWORD PTR [r8], ymm3 + lea r8, QWORD PTR [r8+4*rbx] + add r10d, ebx + vmovdqu YMMWORD PTR [r8], ymm1 + lea r8, QWORD PTR [r8+4*r14] + add r10d, r14d + vpmovzxbd ymm0, QWORD PTR [rcx+48] + vpmovzxbd ymm1, QWORD PTR [rcx+56] + vpslld ymm2, ymm0, 12 + vpslld ymm3, ymm1, 12 + vpor ymm0, ymm0, ymm2 + vpor ymm1, ymm1, ymm3 + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpcmpgtw ymm2, ymm7, ymm0 + vpcmpgtw ymm3, ymm7, ymm1 + vpacksswb ymm2, ymm2, ymm3 + vpmovmskb eax, ymm2 + mov r13d, eax + mov r14d, eax + movzx r12d, al + movzx ebx, ah + shr r13, 16 + shr r14, 24 + and r13d, 255 + shl r12d, 4 + shl ebx, 4 + shl r13d, 4 + shl r14d, 4 + vmovdqu xmm2, OWORD PTR [r15+r12] + vmovdqu xmm3, OWORD PTR [r15+rbx] + vmovdqu xmm4, OWORD PTR [r15+r13] + vmovdqu xmm5, OWORD PTR [r15+r14] + vinserti128 ymm2, ymm2, xmm4, 1 + vinserti128 ymm3, ymm3, xmm5, 1 + vpshufb ymm0, ymm0, ymm2 + vpshufb ymm1, ymm1, 
ymm3 + popcnt r12d, r12d + popcnt ebx, ebx + popcnt r13d, r13d + popcnt r14d, r14d + vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm8, ymm1 + vpmovsxwd ymm2, xmm0 + vpmovsxwd ymm3, xmm1 + vextracti128 xmm0, ymm0, 1 + vextracti128 xmm1, ymm1, 1 + vpmovsxwd ymm0, xmm0 + vpmovsxwd ymm1, xmm1 + vmovdqu YMMWORD PTR [r8], ymm2 + lea r8, QWORD PTR [r8+4*r12] + add r10d, r12d + vmovdqu YMMWORD PTR [r8], ymm0 + lea r8, QWORD PTR [r8+4*r13] + add r10d, r13d + vmovdqu YMMWORD PTR [r8], ymm3 + lea r8, QWORD PTR [r8+4*rbx] + add r10d, ebx + vmovdqu YMMWORD PTR [r8], ymm1 + lea r8, QWORD PTR [r8+4*r14] + add r10d, r14d + vpmovzxbd ymm0, QWORD PTR [rcx+64] + vpmovzxbd ymm1, QWORD PTR [rcx+72] + vpslld ymm2, ymm0, 12 + vpslld ymm3, ymm1, 12 + vpor ymm0, ymm0, ymm2 + vpor ymm1, ymm1, ymm3 + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpcmpgtw ymm2, ymm7, ymm0 + vpcmpgtw ymm3, ymm7, ymm1 + vpacksswb ymm2, ymm2, ymm3 + vpmovmskb eax, ymm2 + mov r13d, eax + mov r14d, eax + movzx r12d, al + movzx ebx, ah + shr r13, 16 + shr r14, 24 + and r13d, 255 + shl r12d, 4 + shl ebx, 4 + shl r13d, 4 + shl r14d, 4 + vmovdqu xmm2, OWORD PTR [r15+r12] + vmovdqu xmm3, OWORD PTR [r15+rbx] + vmovdqu xmm4, OWORD PTR [r15+r13] + vmovdqu xmm5, OWORD PTR [r15+r14] + vinserti128 ymm2, ymm2, xmm4, 1 + vinserti128 ymm3, ymm3, xmm5, 1 + vpshufb ymm0, ymm0, ymm2 + vpshufb ymm1, ymm1, ymm3 + popcnt r12d, r12d + popcnt ebx, ebx + popcnt r13d, r13d + popcnt r14d, r14d + vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm8, ymm1 + vpmovsxwd ymm2, xmm0 + vpmovsxwd ymm3, xmm1 + vextracti128 xmm0, ymm0, 1 + vextracti128 xmm1, ymm1, 1 + vpmovsxwd ymm0, xmm0 + vpmovsxwd ymm1, xmm1 + vmovdqu YMMWORD PTR [r8], ymm2 + lea r8, QWORD PTR [r8+4*r12] + add r10d, r12d + vmovdqu YMMWORD PTR [r8], ymm0 + lea r8, QWORD PTR [r8+4*r13] + add r10d, r13d + vmovdqu YMMWORD PTR [r8], ymm3 + lea r8, QWORD PTR [r8+4*rbx] + add r10d, ebx + vmovdqu YMMWORD PTR [r8], ymm1 + lea r8, QWORD PTR [r8+4*r14] + add r10d, r14d + vpmovzxbd ymm0, QWORD PTR 
[rcx+80] + vpmovzxbd ymm1, QWORD PTR [rcx+88] + vpslld ymm2, ymm0, 12 + vpslld ymm3, ymm1, 12 + vpor ymm0, ymm0, ymm2 + vpor ymm1, ymm1, ymm3 + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpcmpgtw ymm2, ymm7, ymm0 + vpcmpgtw ymm3, ymm7, ymm1 + vpacksswb ymm2, ymm2, ymm3 + vpmovmskb eax, ymm2 + mov r13d, eax + mov r14d, eax + movzx r12d, al + movzx ebx, ah + shr r13, 16 + shr r14, 24 + and r13d, 255 + shl r12d, 4 + shl ebx, 4 + shl r13d, 4 + shl r14d, 4 + vmovdqu xmm2, OWORD PTR [r15+r12] + vmovdqu xmm3, OWORD PTR [r15+rbx] + vmovdqu xmm4, OWORD PTR [r15+r13] + vmovdqu xmm5, OWORD PTR [r15+r14] + vinserti128 ymm2, ymm2, xmm4, 1 + vinserti128 ymm3, ymm3, xmm5, 1 + vpshufb ymm0, ymm0, ymm2 + vpshufb ymm1, ymm1, ymm3 + popcnt r12d, r12d + popcnt ebx, ebx + popcnt r13d, r13d + popcnt r14d, r14d + vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm8, ymm1 + vpmovsxwd ymm2, xmm0 + vpmovsxwd ymm3, xmm1 + vextracti128 xmm0, ymm0, 1 + vextracti128 xmm1, ymm1, 1 + vpmovsxwd ymm0, xmm0 + vpmovsxwd ymm1, xmm1 + vmovdqu YMMWORD PTR [r8], ymm2 + lea r8, QWORD PTR [r8+4*r12] + add r10d, r12d + vmovdqu YMMWORD PTR [r8], ymm0 + lea r8, QWORD PTR [r8+4*r13] + add r10d, r13d + vmovdqu YMMWORD PTR [r8], ymm3 + lea r8, QWORD PTR [r8+4*rbx] + add r10d, ebx + vmovdqu YMMWORD PTR [r8], ymm1 + lea r8, QWORD PTR [r8+4*r14] + add r10d, r14d + vpmovzxbd ymm0, QWORD PTR [rcx+96] + vpmovzxbd ymm1, QWORD PTR [rcx+104] + vpslld ymm2, ymm0, 12 + vpslld ymm3, ymm1, 12 + vpor ymm0, ymm0, ymm2 + vpor ymm1, ymm1, ymm3 + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpcmpgtw ymm2, ymm7, ymm0 + vpcmpgtw ymm3, ymm7, ymm1 + vpacksswb ymm2, ymm2, ymm3 + vpmovmskb eax, ymm2 + mov r13d, eax + mov r14d, eax + movzx r12d, al + movzx ebx, ah + shr r13, 16 + shr r14, 24 + and r13d, 255 + shl r12d, 4 + shl ebx, 4 + shl r13d, 4 + shl r14d, 4 + vmovdqu xmm2, OWORD PTR [r15+r12] + vmovdqu xmm3, OWORD PTR [r15+rbx] + vmovdqu xmm4, OWORD PTR [r15+r13] + vmovdqu xmm5, OWORD PTR [r15+r14] + vinserti128 ymm2, ymm2, xmm4, 
1 + vinserti128 ymm3, ymm3, xmm5, 1 + vpshufb ymm0, ymm0, ymm2 + vpshufb ymm1, ymm1, ymm3 + popcnt r12d, r12d + popcnt ebx, ebx + popcnt r13d, r13d + popcnt r14d, r14d + vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm8, ymm1 + vpmovsxwd ymm2, xmm0 + vpmovsxwd ymm3, xmm1 + vextracti128 xmm0, ymm0, 1 + vextracti128 xmm1, ymm1, 1 + vpmovsxwd ymm0, xmm0 + vpmovsxwd ymm1, xmm1 + vmovdqu YMMWORD PTR [r8], ymm2 + lea r8, QWORD PTR [r8+4*r12] + add r10d, r12d + vmovdqu YMMWORD PTR [r8], ymm0 + lea r8, QWORD PTR [r8+4*r13] + add r10d, r13d + vmovdqu YMMWORD PTR [r8], ymm3 + lea r8, QWORD PTR [r8+4*rbx] + add r10d, ebx + vmovdqu YMMWORD PTR [r8], ymm1 + lea r8, QWORD PTR [r8+4*r14] + add r10d, r14d + vpmovzxbd ymm0, QWORD PTR [rcx+112] + vpmovzxbd ymm1, QWORD PTR [rcx+120] + vpslld ymm2, ymm0, 12 + vpslld ymm3, ymm1, 12 + vpor ymm0, ymm0, ymm2 + vpor ymm1, ymm1, ymm3 + vpand ymm0, ymm0, ymm6 + vpand ymm1, ymm1, ymm6 + vpcmpgtw ymm2, ymm7, ymm0 + vpcmpgtw ymm3, ymm7, ymm1 + vpacksswb ymm2, ymm2, ymm3 + vpmovmskb eax, ymm2 + mov r13d, eax + mov r14d, eax + movzx r12d, al + movzx ebx, ah + shr r13, 16 + shr r14, 24 + and r13d, 255 + shl r12d, 4 + shl ebx, 4 + shl r13d, 4 + shl r14d, 4 + vmovdqu xmm2, OWORD PTR [r15+r12] + vmovdqu xmm3, OWORD PTR [r15+rbx] + vmovdqu xmm4, OWORD PTR [r15+r13] + vmovdqu xmm5, OWORD PTR [r15+r14] + vinserti128 ymm2, ymm2, xmm4, 1 + vinserti128 ymm3, ymm3, xmm5, 1 + vpshufb ymm0, ymm0, ymm2 + vpshufb ymm1, ymm1, ymm3 + popcnt r12d, r12d + popcnt ebx, ebx + popcnt r13d, r13d + popcnt r14d, r14d + vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm8, ymm1 + vpmovsxwd ymm2, xmm0 + vpmovsxwd ymm3, xmm1 + vextracti128 xmm0, ymm0, 1 + vextracti128 xmm1, ymm1, 1 + vpmovsxwd ymm0, xmm0 + vpmovsxwd ymm1, xmm1 + vmovdqu YMMWORD PTR [r8], ymm2 + lea r8, QWORD PTR [r8+4*r12] + add r10d, r12d + vmovdqu YMMWORD PTR [r8], ymm0 + lea r8, QWORD PTR [r8+4*r13] + add r10d, r13d + vmovdqu YMMWORD PTR [r8], ymm3 + lea r8, QWORD PTR [r8+4*rbx] + add r10d, ebx + vmovdqu YMMWORD PTR 
[r8], ymm1 + lea r8, QWORD PTR [r8+4*r14] + add r10d, r14d + sub edx, 128 + add rcx, 128 +L_mldsa_extract_coeffs_eta4_less_than_256: + cmp r10d, 240 + jg L_mldsa_extract_coeffs_eta4_less_than_ymm +L_mldsa_extract_coeffs_eta4_start_one_ymm: + vpmovzxbd ymm0, QWORD PTR [rcx] + vpslld ymm2, ymm0, 12 + vpor ymm0, ymm0, ymm2 + vpand ymm0, ymm0, ymm6 + vpcmpgtw ymm2, ymm7, ymm0 + vpacksswb ymm2, ymm2, ymm2 + vpmovmskb eax, ymm2 + mov r13d, eax + movzx r12d, al + shr r13, 16 + and r13d, 255 + shl r12d, 4 + shl r13d, 4 + vmovdqu xmm2, OWORD PTR [r15+r12] + vmovdqu xmm4, OWORD PTR [r15+r13] + vinserti128 ymm2, ymm2, xmm4, 1 + vpshufb ymm0, ymm0, ymm2 + popcnt r12d, r12d + popcnt r13d, r13d + vpsubw ymm0, ymm8, ymm0 + vpmovsxwd ymm2, xmm0 + vextracti128 xmm0, ymm0, 1 + vpmovsxwd ymm0, xmm0 + vmovdqu YMMWORD PTR [r8], ymm2 + lea r8, QWORD PTR [r8+4*r12] + add r10d, r12d + vmovdqu YMMWORD PTR [r8], ymm0 + lea r8, QWORD PTR [r8+4*r13] + add r10d, r13d + sub edx, 8 + add rcx, 8 + cmp edx, 8 + jl L_mldsa_extract_coeffs_eta4_less_than_ymm + cmp r10d, 240 + jle L_mldsa_extract_coeffs_eta4_start_one_ymm +L_mldsa_extract_coeffs_eta4_less_than_ymm: + cmp r10d, 256 + je L_mldsa_extract_coeffs_eta4_done +L_mldsa_extract_coeffs_eta4_start_byte: + cmp edx, 0 + je L_mldsa_extract_coeffs_eta4_done + movzx ebx, BYTE PTR [rcx] + add rcx, 1 + sub edx, 1 + mov eax, ebx + shr eax, 4 + and bl, 15 + xor r13, r13 + mov r14, 4 + cmp bl, 9 + adc r13d, 0 + sub r14d, ebx + mov DWORD PTR [r8], r14d + add r10d, r13d + shl r13d, 2 + add r8, r13 + cmp r10d, 256 + je L_mldsa_extract_coeffs_eta4_done + xor r13, r13 + mov r14, 4 + cmp al, 9 + adc r13d, 0 + sub r14d, eax + mov DWORD PTR [r8], r14d + add r10d, r13d + shl r13d, 2 + add r8, r13 + cmp r10d, 256 + je L_mldsa_extract_coeffs_eta4_done + jmp L_mldsa_extract_coeffs_eta4_start_byte +L_mldsa_extract_coeffs_eta4_done: + mov DWORD PTR [r9], r10d + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR 
[rsp+32] + add rsp, 48 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret +wc_mldsa_extract_coeffs_eta4_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_redistribute_21_rand_avx2 PROC + push r12 + push r13 + mov rax, QWORD PTR [rsp+56] + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vmovdqu ymm4, YMMWORD PTR [rcx+128] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vmovdqu ymm6, YMMWORD PTR [rcx+192] + vmovdqu ymm7, YMMWORD PTR [rcx+224] + vmovdqu ymm8, YMMWORD PTR [rcx+256] + vmovdqu ymm9, YMMWORD PTR [rcx+288] + vmovdqu ymm10, YMMWORD PTR [rcx+320] + vmovdqu ymm11, YMMWORD PTR [rcx+352] + vpunpcklqdq ymm12, ymm0, ymm1 + vpunpckhqdq ymm13, ymm0, ymm1 + vpunpcklqdq ymm14, ymm2, ymm3 + vpunpckhqdq ymm15, ymm2, ymm3 + vperm2i128 ymm0, ymm12, ymm14, 32 + vperm2i128 ymm1, ymm13, ymm15, 32 + vperm2i128 ymm2, ymm12, ymm14, 49 + vperm2i128 ymm3, ymm13, ymm15, 49 + vpunpcklqdq ymm12, ymm4, ymm5 + vpunpckhqdq ymm13, ymm4, ymm5 + vpunpcklqdq ymm14, ymm6, ymm7 + vpunpckhqdq ymm15, ymm6, ymm7 + vperm2i128 ymm4, ymm12, ymm14, 32 + vperm2i128 ymm5, ymm13, ymm15, 32 + vperm2i128 ymm6, ymm12, ymm14, 49 + vperm2i128 ymm7, ymm13, ymm15, 49 + vpunpcklqdq ymm12, ymm8, ymm9 + vpunpckhqdq ymm13, ymm8, ymm9 + vpunpcklqdq ymm14, ymm10, ymm11 + vpunpckhqdq ymm15, ymm10, ymm11 + vperm2i128 ymm8, ymm12, ymm14, 32 + vperm2i128 ymm9, ymm13, ymm15, 32 + vperm2i128 ymm10, ymm12, ymm14, 49 + vperm2i128 ymm11, ymm13, ymm15, 49 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm4 + vmovdqu YMMWORD PTR [rdx+64], 
ymm8 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm5 + vmovdqu YMMWORD PTR [r8+64], ymm9 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm6 + vmovdqu YMMWORD PTR [r9+64], ymm10 + vmovdqu YMMWORD PTR [rax], ymm3 + vmovdqu YMMWORD PTR [rax+32], ymm7 + vmovdqu YMMWORD PTR [rax+64], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+384] + vmovdqu ymm1, YMMWORD PTR [rcx+416] + vmovdqu ymm2, YMMWORD PTR [rcx+448] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vmovdqu ymm4, YMMWORD PTR [rcx+512] + vmovdqu ymm5, YMMWORD PTR [rcx+544] + vmovdqu ymm6, YMMWORD PTR [rcx+576] + vmovdqu ymm7, YMMWORD PTR [rcx+608] + mov r10, QWORD PTR [rcx+640] + mov r11, QWORD PTR [rcx+648] + mov r12, QWORD PTR [rcx+656] + mov r13, QWORD PTR [rcx+664] + vpunpcklqdq ymm12, ymm0, ymm1 + vpunpckhqdq ymm13, ymm0, ymm1 + vpunpcklqdq ymm14, ymm2, ymm3 + vpunpckhqdq ymm15, ymm2, ymm3 + vperm2i128 ymm0, ymm12, ymm14, 32 + vperm2i128 ymm1, ymm13, ymm15, 32 + vperm2i128 ymm2, ymm12, ymm14, 49 + vperm2i128 ymm3, ymm13, ymm15, 49 + vpunpcklqdq ymm12, ymm4, ymm5 + vpunpckhqdq ymm13, ymm4, ymm5 + vpunpcklqdq ymm14, ymm6, ymm7 + vpunpckhqdq ymm15, ymm6, ymm7 + vperm2i128 ymm4, ymm12, ymm14, 32 + vperm2i128 ymm5, ymm13, ymm15, 32 + vperm2i128 ymm6, ymm12, ymm14, 49 + vperm2i128 ymm7, ymm13, ymm15, 49 + vmovdqu YMMWORD PTR [rdx+96], ymm0 + vmovdqu YMMWORD PTR [rdx+128], ymm4 + mov QWORD PTR [rdx+160], r10 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD PTR [r8+128], ymm5 + mov QWORD PTR [r8+160], r11 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm6 + mov QWORD PTR [r9+160], r12 + vmovdqu YMMWORD PTR [rax+96], ymm3 + vmovdqu YMMWORD PTR [rax+128], ymm7 + mov QWORD PTR [rax+160], r13 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR 
[rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + pop r13 + pop r12 + ret +wc_mldsa_redistribute_21_rand_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_redistribute_17_rand_avx2 PROC + push r12 + push r13 + mov rax, QWORD PTR [rsp+56] + sub rsp, 96 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vmovdqu ymm4, YMMWORD PTR [rcx+128] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vmovdqu ymm6, YMMWORD PTR [rcx+192] + vmovdqu ymm7, YMMWORD PTR [rcx+224] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm9, ymm0, ymm1 + vpunpcklqdq ymm10, ymm2, ymm3 + vpunpckhqdq ymm11, ymm2, ymm3 + vperm2i128 ymm0, ymm8, ymm10, 32 + vperm2i128 ymm1, ymm9, ymm11, 32 + vperm2i128 ymm2, ymm8, ymm10, 49 + vperm2i128 ymm3, ymm9, ymm11, 49 + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm9, ymm4, ymm5 + vpunpcklqdq ymm10, ymm6, ymm7 + vpunpckhqdq ymm11, ymm6, ymm7 + vperm2i128 ymm4, ymm8, ymm10, 32 + vperm2i128 ymm5, ymm9, ymm11, 32 + vperm2i128 ymm6, ymm8, ymm10, 49 + vperm2i128 ymm7, ymm9, ymm11, 49 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm4 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm5 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm6 + vmovdqu YMMWORD PTR [rax], ymm3 + vmovdqu YMMWORD PTR [rax+32], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vmovdqu ymm4, YMMWORD PTR [rcx+384] + vmovdqu ymm5, YMMWORD PTR [rcx+416] + vmovdqu ymm6, YMMWORD PTR [rcx+448] + vmovdqu ymm7, YMMWORD PTR [rcx+480] + mov r10, QWORD PTR [rcx+512] + mov r11, QWORD PTR [rcx+520] + 
mov r12, QWORD PTR [rcx+528] + mov r13, QWORD PTR [rcx+536] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm9, ymm0, ymm1 + vpunpcklqdq ymm10, ymm2, ymm3 + vpunpckhqdq ymm11, ymm2, ymm3 + vperm2i128 ymm0, ymm8, ymm10, 32 + vperm2i128 ymm1, ymm9, ymm11, 32 + vperm2i128 ymm2, ymm8, ymm10, 49 + vperm2i128 ymm3, ymm9, ymm11, 49 + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm9, ymm4, ymm5 + vpunpcklqdq ymm10, ymm6, ymm7 + vpunpckhqdq ymm11, ymm6, ymm7 + vperm2i128 ymm4, ymm8, ymm10, 32 + vperm2i128 ymm5, ymm9, ymm11, 32 + vperm2i128 ymm6, ymm8, ymm10, 49 + vperm2i128 ymm7, ymm9, ymm11, 49 + vmovdqu YMMWORD PTR [rdx+64], ymm0 + vmovdqu YMMWORD PTR [rdx+96], ymm4 + mov QWORD PTR [rdx+128], r10 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm5 + mov QWORD PTR [r8+128], r11 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm6 + mov QWORD PTR [r9+128], r12 + vmovdqu YMMWORD PTR [rax+64], ymm3 + vmovdqu YMMWORD PTR [rax+96], ymm7 + mov QWORD PTR [rax+128], r13 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + add rsp, 96 + pop r13 + pop r12 + ret +wc_mldsa_redistribute_17_rand_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_eta_2_avx2_two DWORD 00000002h, 00000002h, 00000002h, 00000002h + DWORD 00000002h, 00000002h, 00000002h, 00000002h +ptr_L_mldsa_encode_eta_2_avx2_two QWORD L_mldsa_encode_eta_2_avx2_two +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_eta_2_avx2_vs_3 DWORD 00000000h, 00000003h, 00000006h, 00000009h + DWORD 00000004h, 00000007h, 0000000ah, 0000000dh +ptr_L_mldsa_encode_eta_2_avx2_vs_3 QWORD L_mldsa_encode_eta_2_avx2_vs_3 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_eta_2_avx2_shuff_3_even BYTE 00h, 0ffh, 04h, 05h, 08h, 0ffh, 0ch, 0dh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 00h, 0ffh, 04h, 05h, 08h, 
0ffh, 0ch, 0dh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_encode_eta_2_avx2_shuff_3_even QWORD L_mldsa_encode_eta_2_avx2_shuff_3_even +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_eta_2_avx2_shuff_3_odd BYTE 02h, 0ffh, 0ffh, 07h, 0ah, 0bh, 0ffh, 0fh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 02h, 0ffh, 0ffh, 07h, 0ah, 0bh, 0ffh, 0fh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_encode_eta_2_avx2_shuff_3_odd QWORD L_mldsa_encode_eta_2_avx2_shuff_3_odd +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_eta_2_avx2_shuff_6_even BYTE 00h, 04h, 05h, 08h, 0ch, 0dh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 00h, 04h + BYTE 05h, 08h, 0ch, 0dh, 0ffh, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_encode_eta_2_avx2_shuff_6_even QWORD L_mldsa_encode_eta_2_avx2_shuff_6_even +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_eta_2_avx2_shuff_6_odd BYTE 02h, 03h, 07h, 0ah, 0bh, 0fh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 02h, 03h + BYTE 07h, 0ah, 0bh, 0fh, 0ffh, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_encode_eta_2_avx2_shuff_6_odd QWORD L_mldsa_encode_eta_2_avx2_shuff_6_odd +_DATA ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_vec_encode_eta_2_avx2 PROC + sub rsp, 96 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu ymm6, YMMWORD PTR L_mldsa_encode_eta_2_avx2_two + vmovdqu ymm7, YMMWORD PTR L_mldsa_encode_eta_2_avx2_vs_3 + vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_eta_2_avx2_shuff_3_even + vmovdqu ymm9, YMMWORD PTR L_mldsa_encode_eta_2_avx2_shuff_3_odd + vmovdqu ymm10, YMMWORD PTR L_mldsa_encode_eta_2_avx2_shuff_6_even + vmovdqu ymm11, YMMWORD PTR L_mldsa_encode_eta_2_avx2_shuff_6_odd +L_mldsa_encode_eta_2_avx2_loop: + vmovdqu ymm0, 
YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vpsubd ymm0, ymm6, ymm0 + vpsubd ymm1, ymm6, ymm1 + vpsubd ymm2, ymm6, ymm2 + vpsubd ymm3, ymm6, ymm3 + vpsllvd ymm0, ymm0, ymm7 + vpsllvd ymm1, ymm1, ymm7 + vpsllvd ymm2, ymm2, ymm7 + vpsllvd ymm3, ymm3, ymm7 + vpackusdw ymm0, ymm0, ymm2 + vpackusdw ymm1, ymm1, ymm3 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpshufb ymm4, ymm0, ymm9 + vpshufb ymm5, ymm1, ymm9 + vpshufb ymm0, ymm0, ymm8 + vpshufb ymm1, ymm1, ymm8 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpermq ymm1, ymm1, 177 + vpor ymm0, ymm0, ymm1 + vpshufb ymm4, ymm0, ymm11 + vpshufb ymm0, ymm0, ymm10 + vpor ymm0, ymm0, ymm4 + vextracti128 xmm4, ymm0, 1 + vpor ymm0, ymm0, ymm4 + vmovdqu OWORD PTR [r8], xmm0 + add r8, 12 + vmovdqu ymm0, YMMWORD PTR [rcx+128] + vmovdqu ymm1, YMMWORD PTR [rcx+160] + vmovdqu ymm2, YMMWORD PTR [rcx+192] + vmovdqu ymm3, YMMWORD PTR [rcx+224] + vpsubd ymm0, ymm6, ymm0 + vpsubd ymm1, ymm6, ymm1 + vpsubd ymm2, ymm6, ymm2 + vpsubd ymm3, ymm6, ymm3 + vpsllvd ymm0, ymm0, ymm7 + vpsllvd ymm1, ymm1, ymm7 + vpsllvd ymm2, ymm2, ymm7 + vpsllvd ymm3, ymm3, ymm7 + vpackusdw ymm0, ymm0, ymm2 + vpackusdw ymm1, ymm1, ymm3 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpshufb ymm4, ymm0, ymm9 + vpshufb ymm5, ymm1, ymm9 + vpshufb ymm0, ymm0, ymm8 + vpshufb ymm1, ymm1, ymm8 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpermq ymm1, ymm1, 177 + vpor ymm0, ymm0, ymm1 + vpshufb ymm4, ymm0, ymm11 + vpshufb ymm0, ymm0, ymm10 + vpor ymm0, ymm0, ymm4 + vextracti128 xmm4, ymm0, 1 + vpor ymm0, ymm0, ymm4 + vmovdqu OWORD PTR [r8], xmm0 + add r8, 12 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vpsubd ymm0, ymm6, ymm0 + vpsubd ymm1, ymm6, ymm1 + vpsubd ymm2, ymm6, ymm2 + vpsubd ymm3, ymm6, ymm3 + vpsllvd ymm0, ymm0, ymm7 + vpsllvd ymm1, ymm1, ymm7 + 
vpsllvd ymm2, ymm2, ymm7 + vpsllvd ymm3, ymm3, ymm7 + vpackusdw ymm0, ymm0, ymm2 + vpackusdw ymm1, ymm1, ymm3 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpshufb ymm4, ymm0, ymm9 + vpshufb ymm5, ymm1, ymm9 + vpshufb ymm0, ymm0, ymm8 + vpshufb ymm1, ymm1, ymm8 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpermq ymm1, ymm1, 177 + vpor ymm0, ymm0, ymm1 + vpshufb ymm4, ymm0, ymm11 + vpshufb ymm0, ymm0, ymm10 + vpor ymm0, ymm0, ymm4 + vextracti128 xmm4, ymm0, 1 + vpor ymm0, ymm0, ymm4 + vmovdqu OWORD PTR [r8], xmm0 + add r8, 12 + vmovdqu ymm0, YMMWORD PTR [rcx+384] + vmovdqu ymm1, YMMWORD PTR [rcx+416] + vmovdqu ymm2, YMMWORD PTR [rcx+448] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vpsubd ymm0, ymm6, ymm0 + vpsubd ymm1, ymm6, ymm1 + vpsubd ymm2, ymm6, ymm2 + vpsubd ymm3, ymm6, ymm3 + vpsllvd ymm0, ymm0, ymm7 + vpsllvd ymm1, ymm1, ymm7 + vpsllvd ymm2, ymm2, ymm7 + vpsllvd ymm3, ymm3, ymm7 + vpackusdw ymm0, ymm0, ymm2 + vpackusdw ymm1, ymm1, ymm3 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpshufb ymm4, ymm0, ymm9 + vpshufb ymm5, ymm1, ymm9 + vpshufb ymm0, ymm0, ymm8 + vpshufb ymm1, ymm1, ymm8 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpermq ymm1, ymm1, 177 + vpor ymm0, ymm0, ymm1 + vpshufb ymm4, ymm0, ymm11 + vpshufb ymm0, ymm0, ymm10 + vpor ymm0, ymm0, ymm4 + vextracti128 xmm4, ymm0, 1 + vpor ymm0, ymm0, ymm4 + vmovdqu OWORD PTR [r8], xmm0 + add r8, 12 + vmovdqu ymm0, YMMWORD PTR [rcx+512] + vmovdqu ymm1, YMMWORD PTR [rcx+544] + vmovdqu ymm2, YMMWORD PTR [rcx+576] + vmovdqu ymm3, YMMWORD PTR [rcx+608] + vpsubd ymm0, ymm6, ymm0 + vpsubd ymm1, ymm6, ymm1 + vpsubd ymm2, ymm6, ymm2 + vpsubd ymm3, ymm6, ymm3 + vpsllvd ymm0, ymm0, ymm7 + vpsllvd ymm1, ymm1, ymm7 + vpsllvd ymm2, ymm2, ymm7 + vpsllvd ymm3, ymm3, ymm7 + vpackusdw ymm0, ymm0, ymm2 + vpackusdw ymm1, ymm1, ymm3 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpshufb ymm4, ymm0, ymm9 + vpshufb ymm5, ymm1, ymm9 + vpshufb ymm0, ymm0, ymm8 + vpshufb ymm1, ymm1, ymm8 + vpor ymm0, ymm0, ymm4 
+ vpor ymm1, ymm1, ymm5 + vpermq ymm1, ymm1, 177 + vpor ymm0, ymm0, ymm1 + vpshufb ymm4, ymm0, ymm11 + vpshufb ymm0, ymm0, ymm10 + vpor ymm0, ymm0, ymm4 + vextracti128 xmm4, ymm0, 1 + vpor ymm0, ymm0, ymm4 + vmovdqu OWORD PTR [r8], xmm0 + add r8, 12 + vmovdqu ymm0, YMMWORD PTR [rcx+640] + vmovdqu ymm1, YMMWORD PTR [rcx+672] + vmovdqu ymm2, YMMWORD PTR [rcx+704] + vmovdqu ymm3, YMMWORD PTR [rcx+736] + vpsubd ymm0, ymm6, ymm0 + vpsubd ymm1, ymm6, ymm1 + vpsubd ymm2, ymm6, ymm2 + vpsubd ymm3, ymm6, ymm3 + vpsllvd ymm0, ymm0, ymm7 + vpsllvd ymm1, ymm1, ymm7 + vpsllvd ymm2, ymm2, ymm7 + vpsllvd ymm3, ymm3, ymm7 + vpackusdw ymm0, ymm0, ymm2 + vpackusdw ymm1, ymm1, ymm3 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpshufb ymm4, ymm0, ymm9 + vpshufb ymm5, ymm1, ymm9 + vpshufb ymm0, ymm0, ymm8 + vpshufb ymm1, ymm1, ymm8 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpermq ymm1, ymm1, 177 + vpor ymm0, ymm0, ymm1 + vpshufb ymm4, ymm0, ymm11 + vpshufb ymm0, ymm0, ymm10 + vpor ymm0, ymm0, ymm4 + vextracti128 xmm4, ymm0, 1 + vpor ymm0, ymm0, ymm4 + vmovdqu OWORD PTR [r8], xmm0 + add r8, 12 + vmovdqu ymm0, YMMWORD PTR [rcx+768] + vmovdqu ymm1, YMMWORD PTR [rcx+800] + vmovdqu ymm2, YMMWORD PTR [rcx+832] + vmovdqu ymm3, YMMWORD PTR [rcx+864] + vpsubd ymm0, ymm6, ymm0 + vpsubd ymm1, ymm6, ymm1 + vpsubd ymm2, ymm6, ymm2 + vpsubd ymm3, ymm6, ymm3 + vpsllvd ymm0, ymm0, ymm7 + vpsllvd ymm1, ymm1, ymm7 + vpsllvd ymm2, ymm2, ymm7 + vpsllvd ymm3, ymm3, ymm7 + vpackusdw ymm0, ymm0, ymm2 + vpackusdw ymm1, ymm1, ymm3 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpshufb ymm4, ymm0, ymm9 + vpshufb ymm5, ymm1, ymm9 + vpshufb ymm0, ymm0, ymm8 + vpshufb ymm1, ymm1, ymm8 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpermq ymm1, ymm1, 177 + vpor ymm0, ymm0, ymm1 + vpshufb ymm4, ymm0, ymm11 + vpshufb ymm0, ymm0, ymm10 + vpor ymm0, ymm0, ymm4 + vextracti128 xmm4, ymm0, 1 + vpor ymm0, ymm0, ymm4 + vmovdqu OWORD PTR [r8], xmm0 + add r8, 12 + vmovdqu ymm0, YMMWORD PTR [rcx+896] + 
vmovdqu ymm1, YMMWORD PTR [rcx+928] + vmovdqu ymm2, YMMWORD PTR [rcx+960] + vmovdqu ymm3, YMMWORD PTR [rcx+992] + vpsubd ymm0, ymm6, ymm0 + vpsubd ymm1, ymm6, ymm1 + vpsubd ymm2, ymm6, ymm2 + vpsubd ymm3, ymm6, ymm3 + vpsllvd ymm0, ymm0, ymm7 + vpsllvd ymm1, ymm1, ymm7 + vpsllvd ymm2, ymm2, ymm7 + vpsllvd ymm3, ymm3, ymm7 + vpackusdw ymm0, ymm0, ymm2 + vpackusdw ymm1, ymm1, ymm3 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpshufb ymm4, ymm0, ymm9 + vpshufb ymm5, ymm1, ymm9 + vpshufb ymm0, ymm0, ymm8 + vpshufb ymm1, ymm1, ymm8 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpermq ymm1, ymm1, 177 + vpor ymm0, ymm0, ymm1 + vpshufb ymm4, ymm0, ymm11 + vpshufb ymm0, ymm0, ymm10 + vpor ymm0, ymm0, ymm4 + vextracti128 xmm4, ymm0, 1 + vpor ymm0, ymm0, ymm4 + vmovdqu OWORD PTR [r8], xmm0 + add r8, 12 + add rcx, 1024 + dec dl + jnz L_mldsa_encode_eta_2_avx2_loop + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + add rsp, 96 + ret +wc_mldsa_vec_encode_eta_2_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_eta_4_avx2_four DWORD 00000004h, 00000004h, 00000004h, 00000004h + DWORD 00000004h, 00000004h, 00000004h, 00000004h +ptr_L_mldsa_encode_eta_4_avx2_four QWORD L_mldsa_encode_eta_4_avx2_four +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_eta_4_avx2_vs_4 DWORD 00000000h, 00000004h, 00000000h, 00000004h + DWORD 00000000h, 00000004h, 00000000h, 00000004h +ptr_L_mldsa_encode_eta_4_avx2_vs_4 QWORD L_mldsa_encode_eta_4_avx2_vs_4 +_DATA ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_vec_encode_eta_4_avx2 PROC + sub rsp, 64 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vpxor ymm8, ymm8, ymm8 + vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_eta_4_avx2_four + vmovdqu ymm9, YMMWORD PTR 
L_mldsa_encode_eta_4_avx2_vs_4 + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vmovdqu ymm4, YMMWORD PTR [rcx+128] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vmovdqu ymm6, YMMWORD PTR [rcx+192] + vmovdqu ymm7, YMMWORD PTR [rcx+224] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vmovdqu ymm4, YMMWORD PTR [rcx+384] + vmovdqu ymm5, YMMWORD PTR [rcx+416] + vmovdqu ymm6, YMMWORD PTR [rcx+448] + vmovdqu ymm7, YMMWORD PTR [rcx+480] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 
+ vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+32], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+512] + vmovdqu ymm1, YMMWORD PTR [rcx+544] + vmovdqu ymm2, YMMWORD PTR [rcx+576] + vmovdqu ymm3, YMMWORD PTR [rcx+608] + vmovdqu ymm4, YMMWORD PTR [rcx+640] + vmovdqu ymm5, YMMWORD PTR [rcx+672] + vmovdqu ymm6, YMMWORD PTR [rcx+704] + vmovdqu ymm7, YMMWORD PTR [rcx+736] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+64], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+768] + vmovdqu ymm1, YMMWORD PTR [rcx+800] + vmovdqu ymm2, YMMWORD PTR [rcx+832] + vmovdqu ymm3, YMMWORD PTR [rcx+864] + vmovdqu ymm4, YMMWORD PTR [rcx+896] + vmovdqu ymm5, YMMWORD PTR [rcx+928] + vmovdqu ymm6, YMMWORD PTR [rcx+960] + vmovdqu ymm7, YMMWORD PTR [rcx+992] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, 
ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+96], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+1024] + vmovdqu ymm1, YMMWORD PTR [rcx+1056] + vmovdqu ymm2, YMMWORD PTR [rcx+1088] + vmovdqu ymm3, YMMWORD PTR [rcx+1120] + vmovdqu ymm4, YMMWORD PTR [rcx+1152] + vmovdqu ymm5, YMMWORD PTR [rcx+1184] + vmovdqu ymm6, YMMWORD PTR [rcx+1216] + vmovdqu ymm7, YMMWORD PTR [rcx+1248] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+128], ymm0 + 
vmovdqu ymm0, YMMWORD PTR [rcx+1280] + vmovdqu ymm1, YMMWORD PTR [rcx+1312] + vmovdqu ymm2, YMMWORD PTR [rcx+1344] + vmovdqu ymm3, YMMWORD PTR [rcx+1376] + vmovdqu ymm4, YMMWORD PTR [rcx+1408] + vmovdqu ymm5, YMMWORD PTR [rcx+1440] + vmovdqu ymm6, YMMWORD PTR [rcx+1472] + vmovdqu ymm7, YMMWORD PTR [rcx+1504] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+160], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+1536] + vmovdqu ymm1, YMMWORD PTR [rcx+1568] + vmovdqu ymm2, YMMWORD PTR [rcx+1600] + vmovdqu ymm3, YMMWORD PTR [rcx+1632] + vmovdqu ymm4, YMMWORD PTR [rcx+1664] + vmovdqu ymm5, YMMWORD PTR [rcx+1696] + vmovdqu ymm6, YMMWORD PTR [rcx+1728] + vmovdqu ymm7, YMMWORD PTR [rcx+1760] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + 
vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+192], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+1792] + vmovdqu ymm1, YMMWORD PTR [rcx+1824] + vmovdqu ymm2, YMMWORD PTR [rcx+1856] + vmovdqu ymm3, YMMWORD PTR [rcx+1888] + vmovdqu ymm4, YMMWORD PTR [rcx+1920] + vmovdqu ymm5, YMMWORD PTR [rcx+1952] + vmovdqu ymm6, YMMWORD PTR [rcx+1984] + vmovdqu ymm7, YMMWORD PTR [rcx+2016] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+224], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+2048] + vmovdqu ymm1, YMMWORD PTR [rcx+2080] + vmovdqu ymm2, YMMWORD PTR [rcx+2112] + vmovdqu ymm3, YMMWORD PTR [rcx+2144] + vmovdqu ymm4, YMMWORD PTR [rcx+2176] + vmovdqu ymm5, YMMWORD PTR [rcx+2208] + vmovdqu ymm6, YMMWORD PTR [rcx+2240] + vmovdqu ymm7, YMMWORD PTR [rcx+2272] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + 
vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+256], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+2304] + vmovdqu ymm1, YMMWORD PTR [rcx+2336] + vmovdqu ymm2, YMMWORD PTR [rcx+2368] + vmovdqu ymm3, YMMWORD PTR [rcx+2400] + vmovdqu ymm4, YMMWORD PTR [rcx+2432] + vmovdqu ymm5, YMMWORD PTR [rcx+2464] + vmovdqu ymm6, YMMWORD PTR [rcx+2496] + vmovdqu ymm7, YMMWORD PTR [rcx+2528] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR 
[rdx+288], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+2560] + vmovdqu ymm1, YMMWORD PTR [rcx+2592] + vmovdqu ymm2, YMMWORD PTR [rcx+2624] + vmovdqu ymm3, YMMWORD PTR [rcx+2656] + vmovdqu ymm4, YMMWORD PTR [rcx+2688] + vmovdqu ymm5, YMMWORD PTR [rcx+2720] + vmovdqu ymm6, YMMWORD PTR [rcx+2752] + vmovdqu ymm7, YMMWORD PTR [rcx+2784] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+320], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+2816] + vmovdqu ymm1, YMMWORD PTR [rcx+2848] + vmovdqu ymm2, YMMWORD PTR [rcx+2880] + vmovdqu ymm3, YMMWORD PTR [rcx+2912] + vmovdqu ymm4, YMMWORD PTR [rcx+2944] + vmovdqu ymm5, YMMWORD PTR [rcx+2976] + vmovdqu ymm6, YMMWORD PTR [rcx+3008] + vmovdqu ymm7, YMMWORD PTR [rcx+3040] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw 
ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+352], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+3072] + vmovdqu ymm1, YMMWORD PTR [rcx+3104] + vmovdqu ymm2, YMMWORD PTR [rcx+3136] + vmovdqu ymm3, YMMWORD PTR [rcx+3168] + vmovdqu ymm4, YMMWORD PTR [rcx+3200] + vmovdqu ymm5, YMMWORD PTR [rcx+3232] + vmovdqu ymm6, YMMWORD PTR [rcx+3264] + vmovdqu ymm7, YMMWORD PTR [rcx+3296] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+384], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+3328] + vmovdqu ymm1, YMMWORD PTR [rcx+3360] + vmovdqu ymm2, YMMWORD PTR [rcx+3392] + vmovdqu ymm3, YMMWORD PTR [rcx+3424] + vmovdqu ymm4, YMMWORD PTR [rcx+3456] + vmovdqu ymm5, YMMWORD PTR [rcx+3488] + vmovdqu ymm6, YMMWORD PTR [rcx+3520] + vmovdqu ymm7, YMMWORD PTR [rcx+3552] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd 
ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+416], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+3584] + vmovdqu ymm1, YMMWORD PTR [rcx+3616] + vmovdqu ymm2, YMMWORD PTR [rcx+3648] + vmovdqu ymm3, YMMWORD PTR [rcx+3680] + vmovdqu ymm4, YMMWORD PTR [rcx+3712] + vmovdqu ymm5, YMMWORD PTR [rcx+3744] + vmovdqu ymm6, YMMWORD PTR [rcx+3776] + vmovdqu ymm7, YMMWORD PTR [rcx+3808] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + 
vmovdqu YMMWORD PTR [rdx+448], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+3840] + vmovdqu ymm1, YMMWORD PTR [rcx+3872] + vmovdqu ymm2, YMMWORD PTR [rcx+3904] + vmovdqu ymm3, YMMWORD PTR [rcx+3936] + vmovdqu ymm4, YMMWORD PTR [rcx+3968] + vmovdqu ymm5, YMMWORD PTR [rcx+4000] + vmovdqu ymm6, YMMWORD PTR [rcx+4032] + vmovdqu ymm7, YMMWORD PTR [rcx+4064] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+480], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+4096] + vmovdqu ymm1, YMMWORD PTR [rcx+4128] + vmovdqu ymm2, YMMWORD PTR [rcx+4160] + vmovdqu ymm3, YMMWORD PTR [rcx+4192] + vmovdqu ymm4, YMMWORD PTR [rcx+4224] + vmovdqu ymm5, YMMWORD PTR [rcx+4256] + vmovdqu ymm6, YMMWORD PTR [rcx+4288] + vmovdqu ymm7, YMMWORD PTR [rcx+4320] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, 
ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+512], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+4352] + vmovdqu ymm1, YMMWORD PTR [rcx+4384] + vmovdqu ymm2, YMMWORD PTR [rcx+4416] + vmovdqu ymm3, YMMWORD PTR [rcx+4448] + vmovdqu ymm4, YMMWORD PTR [rcx+4480] + vmovdqu ymm5, YMMWORD PTR [rcx+4512] + vmovdqu ymm6, YMMWORD PTR [rcx+4544] + vmovdqu ymm7, YMMWORD PTR [rcx+4576] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+544], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+4608] + vmovdqu ymm1, YMMWORD PTR [rcx+4640] + vmovdqu ymm2, YMMWORD PTR [rcx+4672] + vmovdqu ymm3, YMMWORD PTR [rcx+4704] + vmovdqu ymm4, YMMWORD PTR [rcx+4736] + vmovdqu ymm5, YMMWORD PTR [rcx+4768] + vmovdqu ymm6, YMMWORD PTR [rcx+4800] + vmovdqu ymm7, YMMWORD PTR [rcx+4832] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, 
ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+576], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+4864] + vmovdqu ymm1, YMMWORD PTR [rcx+4896] + vmovdqu ymm2, YMMWORD PTR [rcx+4928] + vmovdqu ymm3, YMMWORD PTR [rcx+4960] + vmovdqu ymm4, YMMWORD PTR [rcx+4992] + vmovdqu ymm5, YMMWORD PTR [rcx+5024] + vmovdqu ymm6, YMMWORD PTR [rcx+5056] + vmovdqu ymm7, YMMWORD PTR [rcx+5088] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq 
ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+608], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+5120] + vmovdqu ymm1, YMMWORD PTR [rcx+5152] + vmovdqu ymm2, YMMWORD PTR [rcx+5184] + vmovdqu ymm3, YMMWORD PTR [rcx+5216] + vmovdqu ymm4, YMMWORD PTR [rcx+5248] + vmovdqu ymm5, YMMWORD PTR [rcx+5280] + vmovdqu ymm6, YMMWORD PTR [rcx+5312] + vmovdqu ymm7, YMMWORD PTR [rcx+5344] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+640], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+5376] + vmovdqu ymm1, YMMWORD PTR [rcx+5408] + vmovdqu ymm2, YMMWORD PTR [rcx+5440] + vmovdqu ymm3, YMMWORD PTR [rcx+5472] + vmovdqu ymm4, YMMWORD PTR [rcx+5504] + vmovdqu ymm5, YMMWORD PTR [rcx+5536] + vmovdqu ymm6, YMMWORD PTR [rcx+5568] + vmovdqu ymm7, YMMWORD PTR [rcx+5600] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + 
vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+672], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+5632] + vmovdqu ymm1, YMMWORD PTR [rcx+5664] + vmovdqu ymm2, YMMWORD PTR [rcx+5696] + vmovdqu ymm3, YMMWORD PTR [rcx+5728] + vmovdqu ymm4, YMMWORD PTR [rcx+5760] + vmovdqu ymm5, YMMWORD PTR [rcx+5792] + vmovdqu ymm6, YMMWORD PTR [rcx+5824] + vmovdqu ymm7, YMMWORD PTR [rcx+5856] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vpsllvd ymm0, ymm0, ymm9 + vpsllvd ymm1, ymm1, ymm9 + vpsllvd ymm2, ymm2, ymm9 + vpsllvd ymm3, ymm3, ymm9 + vpsllvd ymm4, ymm4, ymm9 + vpsllvd ymm5, ymm5, ymm9 + vpsllvd ymm6, ymm6, ymm9 + vpsllvd ymm7, ymm7, ymm9 + vpackusdw ymm0, ymm0, ymm1 + vpackusdw ymm1, ymm2, ymm3 + vpackusdw ymm2, ymm4, ymm5 + vpackusdw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpermq ymm2, ymm2, 216 + vpermq ymm3, ymm3, 216 + vphaddw ymm0, ymm0, ymm1 + vphaddw ymm1, ymm2, ymm3 + vphaddw ymm2, ymm4, ymm5 + vphaddw ymm3, ymm6, ymm7 + vpermq ymm0, ymm0, 216 + vpermq ymm1, ymm1, 216 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 216 + vmovdqu YMMWORD PTR [rdx+704], ymm0 + vmovdqu ymm0, YMMWORD PTR [rcx+5888] + vmovdqu ymm1, YMMWORD PTR [rcx+5920] + vmovdqu ymm2, YMMWORD PTR [rcx+5952] + vmovdqu ymm3, YMMWORD PTR [rcx+5984] + vmovdqu ymm4, YMMWORD PTR [rcx+6016] + vmovdqu ymm5, YMMWORD PTR [rcx+6048] + vmovdqu ymm6, YMMWORD PTR [rcx+6080] + vmovdqu ymm7, YMMWORD PTR [rcx+6112] + vpsubd ymm0, ymm8, 
ymm0
+ vpsubd ymm1, ymm8, ymm1
+ vpsubd ymm2, ymm8, ymm2
+ vpsubd ymm3, ymm8, ymm3
+ vpsubd ymm4, ymm8, ymm4
+ vpsubd ymm5, ymm8, ymm5
+ vpsubd ymm6, ymm8, ymm6
+ vpsubd ymm7, ymm8, ymm7
+ vpsllvd ymm0, ymm0, ymm9
+ vpsllvd ymm1, ymm1, ymm9
+ vpsllvd ymm2, ymm2, ymm9
+ vpsllvd ymm3, ymm3, ymm9
+ vpsllvd ymm4, ymm4, ymm9
+ vpsllvd ymm5, ymm5, ymm9
+ vpsllvd ymm6, ymm6, ymm9
+ vpsllvd ymm7, ymm7, ymm9
+ vpackusdw ymm0, ymm0, ymm1
+ vpackusdw ymm1, ymm2, ymm3
+ vpackusdw ymm2, ymm4, ymm5
+ vpackusdw ymm3, ymm6, ymm7
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpermq ymm2, ymm2, 216
+ vpermq ymm3, ymm3, 216
+ vphaddw ymm0, ymm0, ymm1
+ vphaddw ymm1, ymm2, ymm3
+ vphaddw ymm2, ymm4, ymm5
+ vphaddw ymm3, ymm6, ymm7
+ vpermq ymm0, ymm0, 216
+ vpermq ymm1, ymm1, 216
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 216
+ vmovdqu YMMWORD PTR [rdx+736], ymm0
+ vzeroupper
+ vmovdqu xmm6, OWORD PTR [rsp]
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ add rsp, 64
+ ret
+wc_mldsa_vec_encode_eta_4_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT ; constant tables for wc_mldsa_decode_eta_2_avx2 follow, one _DATA segment per table
+ALIGN 16 ; tables are 32 bytes long but only 16-byte aligned; the decoder reads them with vmovdqu (unaligned load)
+L_mldsa_decode_eta_2_avx2_shuff_0 BYTE 00h, 0ffh, 0ffh, 0ffh, 00h, 0ffh, 0ffh, 0ffh ; vpshufb indices, lane bytes 0-2: dwords 0,1 take byte 0 (0ffh has the top bit set, so that byte becomes zero)
+ BYTE 00h, 01h, 0ffh, 0ffh, 01h, 0ffh, 0ffh, 0ffh ; dword 2 takes bytes 0,1 (its field straddles a byte boundary); dword 3 takes byte 1
+ BYTE 01h, 0ffh, 0ffh, 0ffh, 01h, 02h, 0ffh, 0ffh ; high 128-bit lane: dword 4 takes byte 1; dword 5 takes bytes 1,2
+ BYTE 02h, 0ffh, 0ffh, 0ffh, 02h, 0ffh, 0ffh, 0ffh ; dwords 6,7 take byte 2
+ptr_L_mldsa_decode_eta_2_avx2_shuff_0 QWORD L_mldsa_decode_eta_2_avx2_shuff_0 ; address of the table; wc_mldsa_decode_eta_2_avx2 does not use it and loads through the label directly
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_1 BYTE 01h, 0ffh, 0ffh, 0ffh, 01h, 0ffh, 0ffh, 0ffh ; same index pattern as shuff_0, moved up to lane bytes 1-3
+ BYTE 01h, 02h, 0ffh, 0ffh, 02h, 0ffh, 0ffh, 0ffh
+ BYTE 02h, 0ffh, 0ffh, 0ffh, 02h, 03h, 0ffh, 0ffh
+ BYTE 03h, 0ffh, 0ffh, 0ffh, 03h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_1 QWORD L_mldsa_decode_eta_2_avx2_shuff_1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_2 BYTE 02h, 0ffh, 0ffh, 0ffh, 02h, 0ffh, 0ffh, 0ffh ; same pattern, lane bytes 2-4
+ BYTE 02h, 03h, 0ffh, 0ffh, 03h, 0ffh, 0ffh, 0ffh
+ BYTE 03h, 0ffh, 0ffh, 0ffh, 03h, 04h, 0ffh, 0ffh
+ BYTE 04h, 0ffh, 0ffh, 0ffh, 04h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_2 QWORD L_mldsa_decode_eta_2_avx2_shuff_2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_3 BYTE 03h, 0ffh, 0ffh, 0ffh, 03h, 0ffh, 0ffh, 0ffh ; same pattern, lane bytes 3-5
+ BYTE 03h, 04h, 0ffh, 0ffh, 04h, 0ffh, 0ffh, 0ffh
+ BYTE 04h, 0ffh, 0ffh, 0ffh, 04h, 05h, 0ffh, 0ffh
+ BYTE 05h, 0ffh, 0ffh, 0ffh, 05h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_3 QWORD L_mldsa_decode_eta_2_avx2_shuff_3
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_4 BYTE 04h, 0ffh, 0ffh, 0ffh, 04h, 0ffh, 0ffh, 0ffh ; same pattern, lane bytes 4-6
+ BYTE 04h, 05h, 0ffh, 0ffh, 05h, 0ffh, 0ffh, 0ffh
+ BYTE 05h, 0ffh, 0ffh, 0ffh, 05h, 06h, 0ffh, 0ffh
+ BYTE 06h, 0ffh, 0ffh, 0ffh, 06h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_4 QWORD L_mldsa_decode_eta_2_avx2_shuff_4
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_5 BYTE 05h, 0ffh, 0ffh, 0ffh, 05h, 0ffh, 0ffh, 0ffh ; same pattern, lane bytes 5-7
+ BYTE 05h, 06h, 0ffh, 0ffh, 06h, 0ffh, 0ffh, 0ffh
+ BYTE 06h, 0ffh, 0ffh, 0ffh, 06h, 07h, 0ffh, 0ffh
+ BYTE 07h, 0ffh, 0ffh, 0ffh, 07h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_5 QWORD L_mldsa_decode_eta_2_avx2_shuff_5
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_6 BYTE 06h, 0ffh, 0ffh, 0ffh, 06h, 0ffh, 0ffh, 0ffh ; same pattern, lane bytes 6-8
+ BYTE 06h, 07h, 0ffh, 0ffh, 07h, 0ffh, 0ffh, 0ffh
+ BYTE 07h, 0ffh, 0ffh, 0ffh, 07h, 08h, 0ffh, 0ffh
+ BYTE 08h, 0ffh, 0ffh, 0ffh, 08h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_6 QWORD L_mldsa_decode_eta_2_avx2_shuff_6
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_shuff_7 BYTE 07h, 0ffh, 0ffh, 0ffh, 07h, 0ffh, 0ffh, 0ffh ; same pattern, lane bytes 7-9
+ BYTE 07h, 08h, 0ffh, 0ffh, 08h, 0ffh, 0ffh, 0ffh
+ BYTE 08h, 0ffh, 0ffh, 0ffh, 08h, 09h, 0ffh, 0ffh
+ BYTE 09h, 0ffh, 0ffh, 0ffh, 09h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_2_avx2_shuff_7 QWORD L_mldsa_decode_eta_2_avx2_shuff_7
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_two DWORD
00000002h, 00000002h, 00000002h, 00000002h
+ DWORD 00000002h, 00000002h, 00000002h, 00000002h ; eight dwords of 2: the minuend for the final "2 - field" subtraction in the decoder
+ptr_L_mldsa_decode_eta_2_avx2_two QWORD L_mldsa_decode_eta_2_avx2_two ; address of the constant; the decoder below loads through the label instead
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_vs DWORD 00000000h, 00000003h, 00000006h, 00000001h ; per-dword right-shift counts for vpsrlvd (dwords 0-3)
+ DWORD 00000004h, 00000007h, 00000002h, 00000005h ; shift counts for dwords 4-7; dword i is shifted by (3*i) mod 8
+ptr_L_mldsa_decode_eta_2_avx2_vs QWORD L_mldsa_decode_eta_2_avx2_vs
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_2_avx2_mask DWORD 00000007h, 00000007h, 00000007h, 00000007h ; keeps the low 3 bits of each dword after the shift
+ DWORD 00000007h, 00000007h, 00000007h, 00000007h
+ptr_L_mldsa_decode_eta_2_avx2_mask QWORD L_mldsa_decode_eta_2_avx2_mask
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_decode_eta_2_avx2 PROC ; unpacks 96 bytes at [rcx] (256 fields of 3 bits) into 256 dwords at [rdx], each stored as 2 - field; rcx/rdx are presumably the first two arguments under the Microsoft x64 convention - confirm against the C prototype
+ sub rsp, 144 ; reserve 9 x 16 bytes of stack for the xmm6-xmm14 save area
+ vmovdqu OWORD PTR [rsp], xmm6 ; save xmm6-xmm14; ymm6-ymm14 are overwritten below and these are restored before ret
+ vmovdqu OWORD PTR [rsp+16], xmm7
+ vmovdqu OWORD PTR [rsp+32], xmm8
+ vmovdqu OWORD PTR [rsp+48], xmm9
+ vmovdqu OWORD PTR [rsp+64], xmm10
+ vmovdqu OWORD PTR [rsp+80], xmm11
+ vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13
+ vmovdqu OWORD PTR [rsp+128], xmm14
+ vpxor ymm4, ymm4, ymm4 ; zeroes ymm4, but the next instruction overwrites all of ymm4, so this has no visible effect
+ vmovdqu ymm4, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_0 ; ymm4-ymm11 = byte-gather tables shuff_0..shuff_7 (lane bytes N..N+2 for table N)
+ vmovdqu ymm5, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_1
+ vmovdqu ymm6, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_2
+ vmovdqu ymm7, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_3
+ vmovdqu ymm8, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_4
+ vmovdqu ymm9, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_5
+ vmovdqu ymm10, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_6
+ vmovdqu ymm11, YMMWORD PTR L_mldsa_decode_eta_2_avx2_shuff_7
+ vmovdqu ymm12, YMMWORD PTR L_mldsa_decode_eta_2_avx2_two ; ymm12 = 2 in every dword
+ vmovdqu ymm13, YMMWORD PTR L_mldsa_decode_eta_2_avx2_vs ; ymm13 = per-dword shift counts 0,3,6,1,4,7,2,5
+ vmovdqu ymm14, YMMWORD PTR L_mldsa_decode_eta_2_avx2_mask ; ymm14 = 7 in every dword
+ vmovdqu ymm0, YMMWORD PTR [rcx] ; input bytes 0-31
+ vmovdqu ymm1, YMMWORD PTR [rcx+32] ; input bytes 32-63
+ vmovdqu ymm2, YMMWORD PTR [rcx+64] ; input bytes 64-95
+ vpermq ymm3, ymm0, 68 ; imm 68 selects qwords 0,1,0,1: copy the low 16 bytes into both lanes so vpshufb (per-lane) sees the same bytes
+ vpshufb ymm3, ymm3, ymm4 ; gather the 3 source bytes into 8 dwords (shuff_0: lane bytes 0-2)
+ vpsrlvd ymm3, ymm3, ymm13 ; move each dword's 3-bit field down to bit 0
+ vpand ymm3, ymm3, ymm14 ; keep only the 3-bit field
+ vpsubd ymm3, ymm12, ymm3 ; result = 2 - field
+ vmovdqu YMMWORD PTR [rdx], ymm3 ; output dwords 0-7 (from input bytes 0-2)
+ vpermq ymm3, ymm0, 68 ; the same five-step decode repeats for every 32-byte store below; only the source register and shuffle table change
+ vpshufb ymm3, ymm3, ymm7
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+32], ymm3 ; output dwords 8-15 (input bytes 3-5)
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm10
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+64], ymm3 ; output dwords 16-23 (input bytes 6-8)
+ vpermq ymm0, ymm0, 57 ; imm 57 selects qwords 1,2,3,0: rotate ymm0 down by 8 bytes so the next unread bytes reach the low lane
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm5
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+96], ymm3 ; output dwords 24-31 (input bytes 9-11)
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm8
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+128], ymm3 ; output dwords 32-39 (input bytes 12-14)
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm11
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+160], ymm3 ; output dwords 40-47 (input bytes 15-17)
+ vpermq ymm0, ymm0, 57 ; rotate down another 8 bytes
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm6
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+192], ymm3 ; output dwords 48-55 (input bytes 18-20)
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm9
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+224], ymm3 ; output dwords 56-63 (input bytes 21-23)
+ vpermq ymm0, ymm0, 57 ; rotate down another 8 bytes
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm4
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+256], ymm3 ; output dwords 64-71 (input bytes 24-26)
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm7
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+288], ymm3 ; output dwords 72-79 (input bytes 27-29)
+ vperm2i128 ymm0, ymm0, ymm1, 32 ; imm 32: low lane = low lane of ymm0, high lane = low lane of ymm1 (bridges the first and second 32-byte loads)
+ vpermq ymm0, ymm0, 56 ; imm 56 selects qwords 0,2,3,0: last 8 bytes of the first load followed by the first 8 bytes of the second
+ vpermq ymm3, ymm0, 68
+ vpshufb ymm3, ymm3, ymm10
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+320], ymm3 ; output dwords 80-87 (input bytes 30-32, straddling the two loads)
+ vpermq ymm3, ymm1, 68 ; from here the source is ymm1 (input bytes 32-63)
+ vpshufb ymm3, ymm3, ymm5
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+352], ymm3 ; output dwords 88-95 (input bytes 33-35)
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm8
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+384], ymm3 ; output dwords 96-103 (input bytes 36-38)
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm11
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+416], ymm3 ; output dwords 104-111 (input bytes 39-41)
+ vpermq ymm1, ymm1, 57 ; rotate ymm1 down by 8 bytes
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm6
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+448], ymm3 ; output dwords 112-119 (input bytes 42-44)
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm9
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+480], ymm3 ; output dwords 120-127 (input bytes 45-47)
+ vpermq ymm1, ymm1, 57 ; rotate down another 8 bytes
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm4
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+512], ymm3 ; output dwords 128-135 (input bytes 48-50)
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm7
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+544], ymm3 ; output dwords 136-143 (input bytes 51-53)
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm10
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+576], ymm3 ; output dwords 144-151 (input bytes 54-56)
+ vpermq ymm1, ymm1, 57 ; rotate down another 8 bytes
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm5
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+608], ymm3 ; output dwords 152-159 (input bytes 57-59)
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm8
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+640], ymm3 ; output dwords 160-167 (input bytes 60-62)
+ vperm2i128 ymm1, ymm1, ymm2, 32 ; low lane = low lane of ymm1, high lane = low lane of ymm2 (bridges the second and third loads)
+ vpermq ymm1, ymm1, 56 ; last 8 bytes of the second load followed by the first 8 bytes of the third
+ vpermq ymm3, ymm1, 68
+ vpshufb ymm3, ymm3, ymm11
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+672], ymm3 ; output dwords 168-175 (input bytes 63-65, straddling the two loads)
+ vpermq ymm3, ymm2, 68 ; from here the source is ymm2 (input bytes 64-95)
+ vpshufb ymm3, ymm3, ymm6
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+704], ymm3 ; output dwords 176-183 (input bytes 66-68)
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm9
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+736], ymm3 ; output dwords 184-191 (input bytes 69-71)
+ vpermq ymm2, ymm2, 57 ; rotate ymm2 down by 8 bytes
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm4
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+768], ymm3 ; output dwords 192-199 (input bytes 72-74)
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm7
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+800], ymm3 ; output dwords 200-207 (input bytes 75-77)
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm10
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+832], ymm3 ; output dwords 208-215 (input bytes 78-80)
+ vpermq ymm2, ymm2, 57 ; rotate down another 8 bytes
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm5
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+864], ymm3 ; output dwords 216-223 (input bytes 81-83)
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm8
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+896], ymm3 ; output dwords 224-231 (input bytes 84-86)
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm11
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+928], ymm3 ; output dwords 232-239 (input bytes 87-89)
+ vpermq ymm2, ymm2, 57 ; rotate down another 8 bytes
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm6
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+960], ymm3 ; output dwords 240-247 (input bytes 90-92)
+ vpermq ymm3, ymm2, 68
+ vpshufb ymm3, ymm3, ymm9
+ vpsrlvd ymm3, ymm3, ymm13
+ vpand ymm3, ymm3, ymm14
+ vpsubd ymm3, ymm12, ymm3
+ vmovdqu YMMWORD PTR [rdx+992], ymm3 ; output dwords 248-255 (input bytes 93-95); 1024 output bytes written in total
+ vzeroupper ; zero the upper 128 bits of the ymm registers before the xmm restores and the return
+ vmovdqu xmm6, OWORD PTR [rsp] ; restore xmm6-xmm14 from the save area written in the prologue
+ vmovdqu xmm7, OWORD PTR [rsp+16]
+ vmovdqu xmm8, OWORD PTR [rsp+32]
+ vmovdqu xmm9, OWORD PTR [rsp+48]
+ vmovdqu xmm10, OWORD PTR [rsp+64]
+ vmovdqu xmm11, OWORD PTR [rsp+80]
+ vmovdqu xmm12, OWORD PTR [rsp+96]
+ vmovdqu xmm13, OWORD PTR [rsp+112]
+ vmovdqu xmm14, OWORD PTR [rsp+128]
+ add rsp, 144 ; release the save area
+ ret
+wc_mldsa_decode_eta_2_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_0 BYTE 00h, 0ffh, 0ffh, 0ffh, 00h, 0ffh, 0ffh, 0ffh
+ BYTE 00h, 01h, 0ffh, 0ffh, 01h, 0ffh, 0ffh, 0ffh
+ BYTE 01h, 0ffh, 0ffh, 0ffh, 01h, 02h, 0ffh, 0ffh
+ BYTE 02h, 0ffh, 0ffh, 0ffh, 02h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_4_avx2_shuff_0 QWORD L_mldsa_decode_eta_4_avx2_shuff_0
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_1 BYTE 01h, 0ffh, 0ffh, 0ffh, 01h, 0ffh, 0ffh, 0ffh
+ BYTE 01h, 02h, 0ffh, 0ffh, 02h, 0ffh, 0ffh, 0ffh
+ BYTE 02h, 0ffh, 0ffh, 0ffh, 02h, 03h, 0ffh, 0ffh
+ BYTE 03h, 0ffh, 0ffh, 0ffh, 03h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_4_avx2_shuff_1 QWORD L_mldsa_decode_eta_4_avx2_shuff_1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_2 BYTE 02h, 0ffh, 0ffh, 0ffh, 02h, 0ffh, 0ffh, 0ffh
+ BYTE 02h, 03h, 0ffh, 0ffh, 03h, 0ffh, 0ffh, 0ffh
+ BYTE 03h, 0ffh, 0ffh, 0ffh, 03h, 04h, 0ffh, 0ffh
+ BYTE 04h, 0ffh, 0ffh, 0ffh, 04h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_4_avx2_shuff_2 QWORD L_mldsa_decode_eta_4_avx2_shuff_2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_3 BYTE 03h, 0ffh, 0ffh, 0ffh, 03h, 0ffh, 0ffh, 0ffh
+ BYTE 03h, 04h, 0ffh, 0ffh, 04h, 0ffh, 0ffh, 0ffh
+ BYTE 04h, 0ffh, 0ffh, 0ffh, 04h, 05h, 0ffh, 0ffh
+ BYTE 05h, 0ffh, 0ffh, 0ffh, 05h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_4_avx2_shuff_3 QWORD L_mldsa_decode_eta_4_avx2_shuff_3
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_4 BYTE 04h, 0ffh, 0ffh, 0ffh, 04h, 0ffh, 0ffh, 0ffh
+ BYTE 04h, 05h, 0ffh, 0ffh, 05h, 0ffh, 0ffh, 0ffh
+ BYTE 05h, 0ffh, 0ffh, 0ffh, 05h, 06h, 0ffh, 0ffh
+ BYTE 06h, 0ffh, 0ffh, 0ffh, 06h, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_decode_eta_4_avx2_shuff_4 QWORD L_mldsa_decode_eta_4_avx2_shuff_4
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decode_eta_4_avx2_shuff_5 BYTE 05h, 0ffh, 0ffh, 0ffh, 05h, 0ffh, 0ffh, 0ffh
+ BYTE 05h, 06h, 0ffh, 0ffh, 06h, 0ffh, 0ffh, 0ffh
+ BYTE 06h, 0ffh,
0ffh, 0ffh, 06h, 07h, 0ffh, 0ffh + BYTE 07h, 0ffh, 0ffh, 0ffh, 07h, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_decode_eta_4_avx2_shuff_5 QWORD L_mldsa_decode_eta_4_avx2_shuff_5 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_eta_4_avx2_shuff_6 BYTE 06h, 0ffh, 0ffh, 0ffh, 06h, 0ffh, 0ffh, 0ffh + BYTE 06h, 07h, 0ffh, 0ffh, 07h, 0ffh, 0ffh, 0ffh + BYTE 07h, 0ffh, 0ffh, 0ffh, 07h, 08h, 0ffh, 0ffh + BYTE 08h, 0ffh, 0ffh, 0ffh, 08h, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_decode_eta_4_avx2_shuff_6 QWORD L_mldsa_decode_eta_4_avx2_shuff_6 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_eta_4_avx2_shuff_7 BYTE 07h, 0ffh, 0ffh, 0ffh, 07h, 0ffh, 0ffh, 0ffh + BYTE 07h, 08h, 0ffh, 0ffh, 08h, 0ffh, 0ffh, 0ffh + BYTE 08h, 0ffh, 0ffh, 0ffh, 08h, 09h, 0ffh, 0ffh + BYTE 09h, 0ffh, 0ffh, 0ffh, 09h, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_decode_eta_4_avx2_shuff_7 QWORD L_mldsa_decode_eta_4_avx2_shuff_7 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_eta_4_avx2_four DWORD 00000004h, 00000004h, 00000004h, 00000004h + DWORD 00000004h, 00000004h, 00000004h, 00000004h +ptr_L_mldsa_decode_eta_4_avx2_four QWORD L_mldsa_decode_eta_4_avx2_four +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_eta_4_avx2_vs DWORD 00000000h, 00000004h, 00000008h, 0000000ch + DWORD 00000010h, 00000014h, 00000018h, 0000001ch +ptr_L_mldsa_decode_eta_4_avx2_vs QWORD L_mldsa_decode_eta_4_avx2_vs +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_eta_4_avx2_mask DWORD 0000000fh, 0000000fh, 0000000fh, 0000000fh + DWORD 0000000fh, 0000000fh, 0000000fh, 0000000fh +ptr_L_mldsa_decode_eta_4_avx2_mask QWORD L_mldsa_decode_eta_4_avx2_mask +_DATA ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_decode_eta_4_avx2 PROC + sub rsp, 80 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vpxor ymm8, ymm8, ymm8 + vmovdqu ymm8, YMMWORD PTR L_mldsa_decode_eta_4_avx2_four + vmovdqu ymm9, YMMWORD PTR L_mldsa_decode_eta_4_avx2_vs + 
vmovdqu ymm10, YMMWORD PTR L_mldsa_decode_eta_4_avx2_mask + vpbroadcastd ymm0, DWORD PTR [rcx] + vpbroadcastd ymm1, DWORD PTR [rcx+4] + vpbroadcastd ymm2, DWORD PTR [rcx+8] + vpbroadcastd ymm3, DWORD PTR [rcx+12] + vpbroadcastd ymm4, DWORD PTR [rcx+16] + vpbroadcastd ymm5, DWORD PTR [rcx+20] + vpbroadcastd ymm6, DWORD PTR [rcx+24] + vpbroadcastd ymm7, DWORD PTR [rcx+28] + vpsrlvd ymm0, ymm0, ymm9 + vpsrlvd ymm1, ymm1, ymm9 + vpsrlvd ymm2, ymm2, ymm9 + vpsrlvd ymm3, ymm3, ymm9 + vpsrlvd ymm4, ymm4, ymm9 + vpsrlvd ymm5, ymm5, ymm9 + vpsrlvd ymm6, ymm6, ymm9 + vpsrlvd ymm7, ymm7, ymm9 + vpand ymm0, ymm0, ymm10 + vpand ymm1, ymm1, ymm10 + vpand ymm2, ymm2, ymm10 + vpand ymm3, ymm3, ymm10 + vpand ymm4, ymm4, ymm10 + vpand ymm5, ymm5, ymm10 + vpand ymm6, ymm6, ymm10 + vpand ymm7, ymm7, ymm10 + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + vmovdqu YMMWORD PTR [rdx+128], ymm4 + vmovdqu YMMWORD PTR [rdx+160], ymm5 + vmovdqu YMMWORD PTR [rdx+192], ymm6 + vmovdqu YMMWORD PTR [rdx+224], ymm7 + vpbroadcastd ymm0, DWORD PTR [rcx+32] + vpbroadcastd ymm1, DWORD PTR [rcx+36] + vpbroadcastd ymm2, DWORD PTR [rcx+40] + vpbroadcastd ymm3, DWORD PTR [rcx+44] + vpbroadcastd ymm4, DWORD PTR [rcx+48] + vpbroadcastd ymm5, DWORD PTR [rcx+52] + vpbroadcastd ymm6, DWORD PTR [rcx+56] + vpbroadcastd ymm7, DWORD PTR [rcx+60] + vpsrlvd ymm0, ymm0, ymm9 + vpsrlvd ymm1, ymm1, ymm9 + vpsrlvd ymm2, ymm2, ymm9 + vpsrlvd ymm3, ymm3, ymm9 + vpsrlvd ymm4, ymm4, ymm9 + vpsrlvd ymm5, ymm5, ymm9 + vpsrlvd ymm6, ymm6, ymm9 + vpsrlvd ymm7, ymm7, ymm9 + vpand ymm0, ymm0, ymm10 + vpand ymm1, ymm1, ymm10 + vpand ymm2, ymm2, ymm10 + vpand ymm3, ymm3, ymm10 + vpand ymm4, ymm4, ymm10 + vpand ymm5, ymm5, ymm10 + 
vpand ymm6, ymm6, ymm10 + vpand ymm7, ymm7, ymm10 + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vmovdqu YMMWORD PTR [rdx+256], ymm0 + vmovdqu YMMWORD PTR [rdx+288], ymm1 + vmovdqu YMMWORD PTR [rdx+320], ymm2 + vmovdqu YMMWORD PTR [rdx+352], ymm3 + vmovdqu YMMWORD PTR [rdx+384], ymm4 + vmovdqu YMMWORD PTR [rdx+416], ymm5 + vmovdqu YMMWORD PTR [rdx+448], ymm6 + vmovdqu YMMWORD PTR [rdx+480], ymm7 + vpbroadcastd ymm0, DWORD PTR [rcx+64] + vpbroadcastd ymm1, DWORD PTR [rcx+68] + vpbroadcastd ymm2, DWORD PTR [rcx+72] + vpbroadcastd ymm3, DWORD PTR [rcx+76] + vpbroadcastd ymm4, DWORD PTR [rcx+80] + vpbroadcastd ymm5, DWORD PTR [rcx+84] + vpbroadcastd ymm6, DWORD PTR [rcx+88] + vpbroadcastd ymm7, DWORD PTR [rcx+92] + vpsrlvd ymm0, ymm0, ymm9 + vpsrlvd ymm1, ymm1, ymm9 + vpsrlvd ymm2, ymm2, ymm9 + vpsrlvd ymm3, ymm3, ymm9 + vpsrlvd ymm4, ymm4, ymm9 + vpsrlvd ymm5, ymm5, ymm9 + vpsrlvd ymm6, ymm6, ymm9 + vpsrlvd ymm7, ymm7, ymm9 + vpand ymm0, ymm0, ymm10 + vpand ymm1, ymm1, ymm10 + vpand ymm2, ymm2, ymm10 + vpand ymm3, ymm3, ymm10 + vpand ymm4, ymm4, ymm10 + vpand ymm5, ymm5, ymm10 + vpand ymm6, ymm6, ymm10 + vpand ymm7, ymm7, ymm10 + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vmovdqu YMMWORD PTR [rdx+512], ymm0 + vmovdqu YMMWORD PTR [rdx+544], ymm1 + vmovdqu YMMWORD PTR [rdx+576], ymm2 + vmovdqu YMMWORD PTR [rdx+608], ymm3 + vmovdqu YMMWORD PTR [rdx+640], ymm4 + vmovdqu YMMWORD PTR [rdx+672], ymm5 + vmovdqu YMMWORD PTR [rdx+704], ymm6 + vmovdqu YMMWORD PTR [rdx+736], ymm7 + vpbroadcastd ymm0, DWORD PTR [rcx+96] + vpbroadcastd ymm1, DWORD PTR [rcx+100] + vpbroadcastd ymm2, DWORD PTR [rcx+104] + vpbroadcastd ymm3, DWORD PTR [rcx+108] + vpbroadcastd 
ymm4, DWORD PTR [rcx+112] + vpbroadcastd ymm5, DWORD PTR [rcx+116] + vpbroadcastd ymm6, DWORD PTR [rcx+120] + vpbroadcastd ymm7, DWORD PTR [rcx+124] + vpsrlvd ymm0, ymm0, ymm9 + vpsrlvd ymm1, ymm1, ymm9 + vpsrlvd ymm2, ymm2, ymm9 + vpsrlvd ymm3, ymm3, ymm9 + vpsrlvd ymm4, ymm4, ymm9 + vpsrlvd ymm5, ymm5, ymm9 + vpsrlvd ymm6, ymm6, ymm9 + vpsrlvd ymm7, ymm7, ymm9 + vpand ymm0, ymm0, ymm10 + vpand ymm1, ymm1, ymm10 + vpand ymm2, ymm2, ymm10 + vpand ymm3, ymm3, ymm10 + vpand ymm4, ymm4, ymm10 + vpand ymm5, ymm5, ymm10 + vpand ymm6, ymm6, ymm10 + vpand ymm7, ymm7, ymm10 + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsubd ymm4, ymm8, ymm4 + vpsubd ymm5, ymm8, ymm5 + vpsubd ymm6, ymm8, ymm6 + vpsubd ymm7, ymm8, ymm7 + vmovdqu YMMWORD PTR [rdx+768], ymm0 + vmovdqu YMMWORD PTR [rdx+800], ymm1 + vmovdqu YMMWORD PTR [rdx+832], ymm2 + vmovdqu YMMWORD PTR [rdx+864], ymm3 + vmovdqu YMMWORD PTR [rdx+896], ymm4 + vmovdqu YMMWORD PTR [rdx+928], ymm5 + vmovdqu YMMWORD PTR [rdx+960], ymm6 + vmovdqu YMMWORD PTR [rdx+992], ymm7 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + add rsp, 80 + ret +wc_mldsa_decode_eta_4_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_w1_88_avx2_shuff_0_even BYTE 00h, 09h, 0ah, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 00h, 09h, 0ah, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_encode_w1_88_avx2_shuff_0_even QWORD L_mldsa_encode_w1_88_avx2_shuff_0_even +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_w1_88_avx2_shuff_0_odd BYTE 04h, 05h, 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 04h, 05h, 0eh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh 
+ptr_L_mldsa_encode_w1_88_avx2_shuff_0_odd QWORD L_mldsa_encode_w1_88_avx2_shuff_0_odd
+_DATA ENDS
+; Remaining constant tables for wc_mldsa_encode_w1_88_avx2: byte-shuffle
+; masks for the second input vector and the per-lane left-shift counts.
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_w1_88_avx2_shuff_1_even BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 00h, 09h
+        BYTE 0ah, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+        BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+        BYTE 0ffh, 00h, 09h, 0ah, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_w1_88_avx2_shuff_1_even QWORD L_mldsa_encode_w1_88_avx2_shuff_1_even
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_w1_88_avx2_shuff_1_odd BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 04h, 05h
+        BYTE 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+        BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh
+        BYTE 0ffh, 04h, 05h, 0eh, 0ffh, 0ffh, 0ffh, 0ffh
+ptr_L_mldsa_encode_w1_88_avx2_shuff_1_odd QWORD L_mldsa_encode_w1_88_avx2_shuff_1_odd
+_DATA ENDS
+; Left-shift counts 0, 6, 12, 18 for each group of four dword lanes.
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_w1_88_avx2_vs DWORD 00000000h, 00000006h, 0000000ch, 00000012h
+        DWORD 00000000h, 00000006h, 0000000ch, 00000012h
+ptr_L_mldsa_encode_w1_88_avx2_vs QWORD L_mldsa_encode_w1_88_avx2_vs
+_DATA ENDS
+; ---------------------------------------------------------------------------
+; MLDSA_ENCODE_W1_88_STEP src_off
+;   Assembly-time helper: expands to one unrolled step of the encoder.
+;   Loads 16 dwords from [rcx+src_off], shifts each lane left by the counts
+;   in ymm8, gathers bytes with the four shuffle masks in ymm4-ymm7, ORs the
+;   pieces together, folds the upper 128-bit half into the lower half and
+;   stores 12 bytes at [rdx] (8 via vmovq + 4 via vpextrd); rdx += 12.
+;   Presumably every input dword fits in 6 bits - confirm against caller.
+;   Clobbers ymm0-ymm3.
+; ---------------------------------------------------------------------------
+MLDSA_ENCODE_W1_88_STEP MACRO src_off
+        vmovdqu ymm0, YMMWORD PTR [rcx+src_off]
+        vmovdqu ymm1, YMMWORD PTR [rcx+src_off+32]
+        vpsllvd ymm0, ymm0, ymm8
+        vpsllvd ymm1, ymm1, ymm8
+        vpshufb ymm2, ymm0, ymm5
+        vpshufb ymm0, ymm0, ymm4
+        vpshufb ymm3, ymm1, ymm7
+        vpshufb ymm1, ymm1, ymm6
+        vpor    ymm0, ymm0, ymm2
+        vpor    ymm0, ymm0, ymm3
+        vpor    ymm0, ymm0, ymm1
+        vextracti128 xmm2, ymm0, 1
+        vpor    ymm0, ymm0, ymm2
+        vmovq   QWORD PTR [rdx], xmm0
+        vpextrd DWORD PTR [rdx+8], xmm0, 2
+        add     rdx, 12
+ENDM
+; ---------------------------------------------------------------------------
+; wc_mldsa_encode_w1_88_avx2
+;   rcx : source pointer; 1024 bytes are read (256 dwords).
+;   rdx : destination pointer; 16 x 12 = 192 bytes are written.  rdx is
+;         advanced while writing.
+;   xmm6-xmm8 are spilled to a 48-byte stack area and restored on exit.
+; ---------------------------------------------------------------------------
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_encode_w1_88_avx2 PROC
+        sub     rsp, 48
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        ; Loop-invariant constants.  (A redundant zeroing of ymm4 that was
+        ; immediately overwritten by this load has been dropped.)
+        vmovdqu ymm4, YMMWORD PTR L_mldsa_encode_w1_88_avx2_shuff_0_even
+        vmovdqu ymm5, YMMWORD PTR L_mldsa_encode_w1_88_avx2_shuff_0_odd
+        vmovdqu ymm6, YMMWORD PTR L_mldsa_encode_w1_88_avx2_shuff_1_even
+        vmovdqu ymm7, YMMWORD PTR L_mldsa_encode_w1_88_avx2_shuff_1_odd
+        vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_w1_88_avx2_vs
+        ; 16 steps x 64 input bytes.
+        MLDSA_ENCODE_W1_88_STEP 0
+        MLDSA_ENCODE_W1_88_STEP 64
+        MLDSA_ENCODE_W1_88_STEP 128
+        MLDSA_ENCODE_W1_88_STEP 192
+        MLDSA_ENCODE_W1_88_STEP 256
+        MLDSA_ENCODE_W1_88_STEP 320
+        MLDSA_ENCODE_W1_88_STEP 384
+        MLDSA_ENCODE_W1_88_STEP 448
+        MLDSA_ENCODE_W1_88_STEP 512
+        MLDSA_ENCODE_W1_88_STEP 576
+        MLDSA_ENCODE_W1_88_STEP 640
+        MLDSA_ENCODE_W1_88_STEP 704
+        MLDSA_ENCODE_W1_88_STEP 768
+        MLDSA_ENCODE_W1_88_STEP 832
+        MLDSA_ENCODE_W1_88_STEP 896
+        MLDSA_ENCODE_W1_88_STEP 960
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        add     rsp, 48
+        ret
+wc_mldsa_encode_w1_88_avx2 ENDP
+_TEXT ENDS
+; Left-shift counts 0, 4 alternating across the eight dword lanes.
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_w1_32_avx2_vs_4 DWORD 00000000h, 00000004h, 00000000h, 00000004h
+        DWORD 00000000h, 00000004h, 00000000h, 00000004h
+ptr_L_mldsa_encode_w1_32_avx2_vs_4 QWORD L_mldsa_encode_w1_32_avx2_vs_4
+_DATA ENDS
+; ---------------------------------------------------------------------------
+; MLDSA_ENCODE_W1_32_STEP src_off, dst_off
+;   Assembly-time helper: expands to one unrolled step of the encoder.
+;   Loads 64 dwords from [rcx+src_off], shifts odd lanes left by 4 (counts
+;   in ymm8), narrows dword->word (vpackusdw), adds adjacent word pairs
+;   (vphaddw) so each word holds lo + (hi << 4), narrows word->byte
+;   (vpackuswb) and stores 32 bytes at [rdx+dst_off].  The vpermq 216
+;   (0d8h) shuffles undo the per-128-bit-lane interleaving of the packs.
+;   Presumably every input dword fits in 4 bits - confirm against caller.
+;   Two vphaddw instructions whose results (ymm2, ymm3) were never read
+;   have been dropped.  Clobbers ymm0-ymm7.
+; ---------------------------------------------------------------------------
+MLDSA_ENCODE_W1_32_STEP MACRO src_off, dst_off
+        vmovdqu ymm0, YMMWORD PTR [rcx+src_off]
+        vmovdqu ymm1, YMMWORD PTR [rcx+src_off+32]
+        vmovdqu ymm2, YMMWORD PTR [rcx+src_off+64]
+        vmovdqu ymm3, YMMWORD PTR [rcx+src_off+96]
+        vmovdqu ymm4, YMMWORD PTR [rcx+src_off+128]
+        vmovdqu ymm5, YMMWORD PTR [rcx+src_off+160]
+        vmovdqu ymm6, YMMWORD PTR [rcx+src_off+192]
+        vmovdqu ymm7, YMMWORD PTR [rcx+src_off+224]
+        FOR vr, <ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7>
+        vpsllvd vr, vr, ymm8
+        ENDM
+        vpackusdw ymm0, ymm0, ymm1
+        vpackusdw ymm1, ymm2, ymm3
+        vpackusdw ymm2, ymm4, ymm5
+        vpackusdw ymm3, ymm6, ymm7
+        FOR vr, <ymm0, ymm1, ymm2, ymm3>
+        vpermq  vr, vr, 216
+        ENDM
+        vphaddw ymm0, ymm0, ymm1
+        vphaddw ymm1, ymm2, ymm3
+        vpermq  ymm0, ymm0, 216
+        vpermq  ymm1, ymm1, 216
+        vpackuswb ymm0, ymm0, ymm1
+        vpermq  ymm0, ymm0, 216
+        vmovdqu YMMWORD PTR [rdx+dst_off], ymm0
+ENDM
+; ---------------------------------------------------------------------------
+; wc_mldsa_encode_w1_32_avx2
+;   rcx : source pointer; 1024 bytes are read (256 dwords).
+;   rdx : destination pointer; 128 bytes are written.
+;   xmm6-xmm8 are spilled to a 48-byte stack area and restored on exit.
+; ---------------------------------------------------------------------------
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_encode_w1_32_avx2 PROC
+        sub     rsp, 48
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        ; Loop-invariant shift counts.  (A redundant zeroing of ymm8 that
+        ; was immediately overwritten by this load has been dropped.)
+        vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_w1_32_avx2_vs_4
+        ; 4 steps x 256 input bytes -> 4 x 32 output bytes.
+        MLDSA_ENCODE_W1_32_STEP 0, 0
+        MLDSA_ENCODE_W1_32_STEP 256, 32
+        MLDSA_ENCODE_W1_32_STEP 512, 64
+        MLDSA_ENCODE_W1_32_STEP 768, 96
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        add     rsp, 48
+        ret
+wc_mldsa_encode_w1_32_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_t0_t1_avx2_d_max_half_m1 DWORD 00000fffh, 00000fffh, 00000fffh, 00000fffh
+        DWORD 00000fffh, 00000fffh, 00000fffh, 00000fffh
+ptr_L_mldsa_encode_t0_t1_avx2_d_max_half_m1 QWORD L_mldsa_encode_t0_t1_avx2_d_max_half_m1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_encode_t0_t1_avx2_d_max_half DWORD 00001000h, 00001000h, 00001000h, 00001000h
+        DWORD 00001000h, 00001000h, 00001000h, 00001000h
+ptr_L_mldsa_encode_t0_t1_avx2_d_max_half QWORD
L_mldsa_encode_t0_t1_avx2_d_max_half +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_t0_t1_avx2_vs_13 DWORD 00000000h, 0000000dh, 00000002h, 0000000fh + DWORD 00000004h, 00000011h, 00000006h, 00000013h +ptr_L_mldsa_encode_t0_t1_avx2_vs_13 QWORD L_mldsa_encode_t0_t1_avx2_vs_13 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_t0_t1_avx2_shuff_13_even BYTE 00h, 01h, 0ffh, 08h, 09h, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 00h, 01h + BYTE 02h, 08h, 09h, 0ah, 0ffh, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_encode_t0_t1_avx2_shuff_13_even QWORD L_mldsa_encode_t0_t1_avx2_shuff_13_even +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_t0_t1_avx2_shuff_13_odd BYTE 0ffh, 05h, 06h, 07h, 0dh, 0eh, 0fh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 06h, 07h, 0ffh, 0eh, 0fh, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_encode_t0_t1_avx2_shuff_13_odd QWORD L_mldsa_encode_t0_t1_avx2_shuff_13_odd +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_t0_t1_avx2_vs_10 DWORD 00000000h, 0000000ah, 00000004h, 0000000eh + DWORD 00000000h, 0000000ah, 00000004h, 0000000eh +ptr_L_mldsa_encode_t0_t1_avx2_vs_10 QWORD L_mldsa_encode_t0_t1_avx2_vs_10 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_t0_t1_avx2_shuff_10_even BYTE 00h, 01h, 08h, 09h, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 00h, 01h, 08h + BYTE 09h, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_encode_t0_t1_avx2_shuff_10_even QWORD L_mldsa_encode_t0_t1_avx2_shuff_10_even +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_t0_t1_avx2_shuff_10_odd BYTE 0ffh, 05h, 06h, 0dh, 0eh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 05h, 06h + BYTE 0dh, 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_encode_t0_t1_avx2_shuff_10_odd QWORD 
L_mldsa_encode_t0_t1_avx2_shuff_10_odd +_DATA ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_vec_encode_t0_t1_avx2 PROC + sub rsp, 128 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu ymm6, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_d_max_half_m1 + vmovdqu ymm7, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_d_max_half + vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_vs_13 + vmovdqu ymm9, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_shuff_13_even + vmovdqu ymm10, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_shuff_13_odd + vmovdqu ymm11, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_vs_10 + vmovdqu ymm12, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_shuff_10_even + vmovdqu ymm13, YMMWORD PTR L_mldsa_encode_t0_t1_avx2_shuff_10_odd +L_mldsa_encode_t0_t1_avx2_loop: + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, 
ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+64] + vmovdqu ymm1, YMMWORD PTR [rcx+96] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+128] + vmovdqu ymm1, YMMWORD PTR [rcx+160] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + 
vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+192] + vmovdqu ymm1, YMMWORD PTR [rcx+224] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb 
ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+320] + vmovdqu ymm1, YMMWORD PTR [rcx+352] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+384] + vmovdqu ymm1, YMMWORD PTR [rcx+416] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + 
vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+448] + vmovdqu ymm1, YMMWORD PTR [rcx+480] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + 
vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+512] + vmovdqu ymm1, YMMWORD PTR [rcx+544] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+576] + vmovdqu ymm1, YMMWORD PTR [rcx+608] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 
+ add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+640] + vmovdqu ymm1, YMMWORD PTR [rcx+672] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+704] + vmovdqu ymm1, YMMWORD PTR [rcx+736] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, 
ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+768] + vmovdqu ymm1, YMMWORD PTR [rcx+800] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+832] + vmovdqu ymm1, YMMWORD PTR [rcx+864] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, 
ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+896] + vmovdqu ymm1, YMMWORD PTR [rcx+928] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 
xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + vmovdqu ymm0, YMMWORD PTR [rcx+960] + vmovdqu ymm1, YMMWORD PTR [rcx+992] + vpaddd ymm4, ymm0, ymm6 + vpaddd ymm5, ymm1, ymm6 + vpsrld ymm4, ymm4, 13 + vpsrld ymm5, ymm5, 13 + vpslld ymm2, ymm4, 13 + vpslld ymm3, ymm5, 13 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm7, ymm2 + vpsubd ymm3, ymm7, ymm3 + vpsllvd ymm2, ymm2, ymm8 + vpsllvd ymm3, ymm3, ymm8 + vpshufb ymm0, ymm2, ymm10 + vpshufb ymm1, ymm3, ymm10 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vextracti128 xmm0, ymm2, 1 + vextracti128 xmm1, ymm3, 1 + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu OWORD PTR [r8], xmm2 + add r8, 13 + vmovdqu OWORD PTR [r8], xmm3 + add r8, 13 + vpsllvd ymm4, ymm4, ymm11 + vpsllvd ymm5, ymm5, ymm11 + vpshufb ymm0, ymm4, ymm13 + vpshufb ymm1, ymm5, ymm13 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vextracti128 xmm0, ymm4, 1 + vextracti128 xmm1, ymm5, 1 + vpor ymm4, ymm4, ymm0 + vpor ymm5, ymm5, ymm1 + vmovdqu OWORD PTR [r9], xmm4 + add r9, 10 + vmovdqu OWORD PTR [r9], xmm5 + add r9, 10 + add rcx, 1024 + dec dl + jnz L_mldsa_encode_t0_t1_avx2_loop + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + add rsp, 128 + ret +wc_mldsa_vec_encode_t0_t1_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t0_avx2_shuff_0 BYTE 00h, 01h, 0ffh, 0ffh, 0ffh, 01h, 02h, 03h + BYTE 03h, 04h, 05h, 0ffh, 04h, 05h, 06h, 07h + BYTE 06h, 07h, 08h, 0ffh, 0ffh, 08h, 09h, 0ffh + BYTE 09h, 0ah, 0bh, 0ffh, 0ffh, 0ffh, 0bh, 0ch 
+ptr_L_mldsa_decode_t0_avx2_shuff_0 QWORD L_mldsa_decode_t0_avx2_shuff_0 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t0_avx2_shuff_1 BYTE 05h, 06h, 0ffh, 0ffh, 0ffh, 06h, 07h, 08h + BYTE 08h, 09h, 0ffh, 0ffh, 09h, 0ah, 0bh, 0ffh + BYTE 03h, 04h, 05h, 0ffh, 0ffh, 05h, 06h, 0ffh + BYTE 06h, 07h, 08h, 0ffh, 0ffh, 0ffh, 08h, 09h +ptr_L_mldsa_decode_t0_avx2_shuff_1 QWORD L_mldsa_decode_t0_avx2_shuff_1 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t0_avx2_shuff_2 BYTE 02h, 03h, 0ffh, 0ffh, 0ffh, 03h, 04h, 05h + BYTE 05h, 06h, 0ffh, 0ffh, 06h, 07h, 08h, 0ffh + BYTE 00h, 01h, 02h, 0ffh, 0ffh, 02h, 03h, 0ffh + BYTE 03h, 04h, 05h, 0ffh, 0ffh, 0ffh, 05h, 06h +ptr_L_mldsa_decode_t0_avx2_shuff_2 QWORD L_mldsa_decode_t0_avx2_shuff_2 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t0_avx2_shuff_3 BYTE 07h, 08h, 0ffh, 0ffh, 0ffh, 08h, 09h, 0ah + BYTE 0ah, 0bh, 0ffh, 0ffh, 0bh, 0ch, 0dh, 0ffh + BYTE 05h, 06h, 07h, 0ffh, 0ffh, 07h, 08h, 0ffh + BYTE 08h, 09h, 0ah, 0ffh, 0ffh, 0ffh, 0ah, 0bh +ptr_L_mldsa_decode_t0_avx2_shuff_3 QWORD L_mldsa_decode_t0_avx2_shuff_3 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t0_avx2_shuff_4 BYTE 04h, 05h, 0ffh, 0ffh, 0ffh, 05h, 06h, 07h + BYTE 07h, 08h, 0ffh, 0ffh, 08h, 09h, 0ah, 0ffh + BYTE 02h, 03h, 04h, 0ffh, 0ffh, 04h, 05h, 0ffh + BYTE 05h, 06h, 07h, 0ffh, 0ffh, 0ffh, 07h, 08h +ptr_L_mldsa_decode_t0_avx2_shuff_4 QWORD L_mldsa_decode_t0_avx2_shuff_4 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t0_avx2_shuff_5 BYTE 01h, 02h, 0ffh, 0ffh, 0ffh, 02h, 03h, 04h + BYTE 04h, 05h, 0ffh, 0ffh, 05h, 06h, 07h, 0ffh + BYTE 07h, 08h, 09h, 0ffh, 0ffh, 09h, 0ah, 0ffh + BYTE 0ah, 0bh, 0ch, 0ffh, 0ffh, 0ffh, 0ch, 0dh +ptr_L_mldsa_decode_t0_avx2_shuff_5 QWORD L_mldsa_decode_t0_avx2_shuff_5 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t0_avx2_shuff_6 BYTE 06h, 07h, 0ffh, 0ffh, 0ffh, 07h, 08h, 09h + BYTE 09h, 0ah, 0ffh, 0ffh, 0ah, 0bh, 0ch, 0ffh + BYTE 04h, 05h, 06h, 0ffh, 0ffh, 06h, 07h, 08h + BYTE 07h, 08h, 09h, 0ffh, 0ffh, 
0ffh, 09h, 0ah +ptr_L_mldsa_decode_t0_avx2_shuff_6 QWORD L_mldsa_decode_t0_avx2_shuff_6 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t0_avx2_shuff_7 BYTE 03h, 04h, 0ffh, 0ffh, 0ffh, 04h, 05h, 06h + BYTE 06h, 07h, 0ffh, 0ffh, 07h, 08h, 09h, 0ffh + BYTE 01h, 02h, 03h, 0ffh, 0ffh, 03h, 04h, 0ffh + BYTE 04h, 05h, 06h, 0ffh, 0ffh, 0ffh, 06h, 07h +ptr_L_mldsa_decode_t0_avx2_shuff_7 QWORD L_mldsa_decode_t0_avx2_shuff_7 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t0_avx2_vs_8 DWORD 00000000h, 0000000dh, 00000002h, 00000007h + DWORD 00000004h, 00000009h, 00000006h, 00000013h +ptr_L_mldsa_decode_t0_avx2_vs_8 QWORD L_mldsa_decode_t0_avx2_vs_8 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t0_avx2_mask DWORD 00001fffh, 00001fffh, 00001fffh, 00001fffh + DWORD 00001fffh, 00001fffh, 00001fffh, 00001fffh +ptr_L_mldsa_decode_t0_avx2_mask QWORD L_mldsa_decode_t0_avx2_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t0_avx2_d_max_half DWORD 00001000h, 00001000h, 00001000h, 00001000h + DWORD 00001000h, 00001000h, 00001000h, 00001000h +ptr_L_mldsa_decode_t0_avx2_d_max_half QWORD L_mldsa_decode_t0_avx2_d_max_half +_DATA ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_decode_t0_avx2 PROC + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vpxor ymm5, ymm5, ymm5 + vmovdqu ymm5, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_0 + vmovdqu ymm6, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_1 + vmovdqu ymm7, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_2 + vmovdqu ymm8, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_3 + vmovdqu ymm9, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_4 + vmovdqu ymm10, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_5 + vmovdqu ymm11, YMMWORD PTR 
L_mldsa_decode_t0_avx2_shuff_6 + vmovdqu ymm12, YMMWORD PTR L_mldsa_decode_t0_avx2_shuff_7 + vmovdqu ymm13, YMMWORD PTR L_mldsa_decode_t0_avx2_vs_8 + vmovdqu ymm14, YMMWORD PTR L_mldsa_decode_t0_avx2_mask + vmovdqu ymm15, YMMWORD PTR L_mldsa_decode_t0_avx2_d_max_half + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + ; 1/32 + vpermq ymm4, ymm0, 68 + vpshufb ymm4, ymm4, ymm5 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx], ymm4 + ; 2/32 + vpermq ymm4, ymm0, 233 + vpshufb ymm4, ymm4, ymm6 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+32], ymm4 + ; 3/32 + vperm2i128 ymm0, ymm0, ymm1, 33 + vpermq ymm4, ymm0, 233 + vpshufb ymm4, ymm4, ymm7 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+64], ymm4 + ; 4/32 + vpermq ymm4, ymm1, 148 + vpshufb ymm4, ymm4, ymm8 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+96], ymm4 + ; 5/32 + vperm2i128 ymm1, ymm1, ymm2, 33 + vpermq ymm4, ymm1, 148 + vpshufb ymm4, ymm4, ymm9 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+128], ymm4 + ; 6/32 + vpermq ymm4, ymm2, 68 + vpshufb ymm4, ymm4, ymm10 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+160], ymm4 + ; 7/32 + vpermq ymm4, ymm2, 233 + vpshufb ymm4, ymm4, ymm11 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+192], ymm4 + ; 8/32 + vperm2i128 ymm2, ymm2, ymm3, 33 + vpermq ymm4, ymm2, 233 + vpshufb ymm4, ymm4, ymm12 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+224], ymm4 + ; 9/32 + vpermq ymm4, ymm3, 153 + vpshufb ymm4, 
ymm4, ymm5 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+256], ymm4 + ; 10/32 + vmovdqu ymm0, YMMWORD PTR [rcx+128] + vmovdqu ymm1, YMMWORD PTR [rcx+160] + vperm2i128 ymm3, ymm3, ymm0, 33 + vpermq ymm4, ymm3, 148 + vpshufb ymm4, ymm4, ymm6 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+288], ymm4 + ; 11/32 + vpermq ymm4, ymm0, 148 + vpshufb ymm4, ymm4, ymm7 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+320], ymm4 + ; 12/32 + vpermq ymm4, ymm0, 233 + vpshufb ymm4, ymm4, ymm8 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+352], ymm4 + ; 13/32 + vperm2i128 ymm0, ymm0, ymm1, 33 + vpermq ymm4, ymm0, 233 + vpshufb ymm4, ymm4, ymm9 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+384], ymm4 + ; 14/32 + vpermq ymm4, ymm1, 153 + vpshufb ymm4, ymm4, ymm10 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+416], ymm4 + ; 15/32 + vmovdqu ymm2, YMMWORD PTR [rcx+192] + vmovdqu ymm3, YMMWORD PTR [rcx+224] + vperm2i128 ymm1, ymm1, ymm2, 33 + vpermq ymm4, ymm1, 148 + vpshufb ymm4, ymm4, ymm11 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+448], ymm4 + ; 16/32 + vpermq ymm4, ymm2, 148 + vpshufb ymm4, ymm4, ymm12 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+480], ymm4 + ; 17/32 + vpermq ymm4, ymm2, 238 + vpshufb ymm4, ymm4, ymm5 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+512], ymm4 + ; 18/32 + vperm2i128 ymm2, ymm2, ymm3, 33 + vpermq ymm4, ymm2, 233 + vpshufb ymm4, ymm4, ymm6 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, 
ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+544], ymm4 + ; 19/32 + vpermq ymm4, ymm3, 233 + vpshufb ymm4, ymm4, ymm7 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+576], ymm4 + ; 20/32 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vperm2i128 ymm3, ymm3, ymm0, 33 + vpermq ymm4, ymm3, 148 + vpshufb ymm4, ymm4, ymm8 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+608], ymm4 + ; 21/32 + vpermq ymm4, ymm0, 148 + vpshufb ymm4, ymm4, ymm9 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+640], ymm4 + ; 22/32 + vpermq ymm4, ymm0, 238 + vpshufb ymm4, ymm4, ymm10 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+672], ymm4 + ; 23/32 + vperm2i128 ymm0, ymm0, ymm1, 33 + vpermq ymm4, ymm0, 233 + vpshufb ymm4, ymm4, ymm11 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+704], ymm4 + ; 24/32 + vpermq ymm4, ymm1, 233 + vpshufb ymm4, ymm4, ymm12 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+736], ymm4 + ; 25/32 + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vperm2i128 ymm1, ymm1, ymm2, 33 + vpermq ymm4, ymm1, 153 + vpshufb ymm4, ymm4, ymm5 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+768], ymm4 + ; 26/32 + vpermq ymm4, ymm2, 148 + vpshufb ymm4, ymm4, ymm6 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+800], ymm4 + ; 27/32 + vpermq ymm4, ymm2, 62 + vpshufb ymm4, ymm4, ymm7 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+832], ymm4 + ; 28/32 + vperm2i128 ymm2, ymm2, ymm3, 33 + vpermq ymm4, ymm2, 
233 + vpshufb ymm4, ymm4, ymm8 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+864], ymm4 + ; 29/32 + vpermq ymm4, ymm3, 233 + vpshufb ymm4, ymm4, ymm9 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+896], ymm4 + ; 30/32 + vmovdqu ymm0, YMMWORD PTR [rcx+384] + vmovdqu ymm1, YMMWORD PTR [rcx+416] + vperm2i128 ymm3, ymm3, ymm0, 33 + vpermq ymm4, ymm3, 153 + vpshufb ymm4, ymm4, ymm10 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+928], ymm4 + ; 31/32 + vpermq ymm4, ymm0, 148 + vpshufb ymm4, ymm4, ymm11 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+960], ymm4 + ; 32/32 + vpermq ymm4, ymm0, 62 + vpshufb ymm4, ymm4, ymm12 + vpsrlvd ymm4, ymm4, ymm13 + vpand ymm4, ymm4, ymm14 + vpsubd ymm4, ymm15, ymm4 + vmovdqu YMMWORD PTR [rdx+992], ymm4 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +wc_mldsa_decode_t0_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t1_avx2_shuff_0 BYTE 00h, 01h, 0ffh, 0ffh, 0ffh, 01h, 02h, 0ffh + BYTE 02h, 03h, 0ffh, 0ffh, 0ffh, 03h, 04h, 0ffh + BYTE 05h, 06h, 0ffh, 0ffh, 0ffh, 06h, 07h, 0ffh + BYTE 07h, 08h, 0ffh, 0ffh, 0ffh, 08h, 09h, 0ffh +ptr_L_mldsa_decode_t1_avx2_shuff_0 QWORD L_mldsa_decode_t1_avx2_shuff_0 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t1_avx2_shuff_1 BYTE 02h, 03h, 0ffh, 0ffh, 0ffh, 03h, 04h, 0ffh + BYTE 04h, 05h, 0ffh, 0ffh, 0ffh, 05h, 06h, 0ffh + BYTE 07h, 08h, 0ffh, 0ffh, 0ffh, 08h, 09h, 0ffh + BYTE 09h, 0ah, 08h, 0ffh, 0ffh, 
0ah, 0bh, 0ffh +ptr_L_mldsa_decode_t1_avx2_shuff_1 QWORD L_mldsa_decode_t1_avx2_shuff_1 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t1_avx2_shuff_2 BYTE 04h, 05h, 0ffh, 0ffh, 0ffh, 05h, 06h, 0ffh + BYTE 06h, 07h, 0ffh, 0ffh, 0ffh, 07h, 08h, 0ffh + BYTE 01h, 02h, 0ffh, 0ffh, 0ffh, 02h, 03h, 0ffh + BYTE 03h, 04h, 0ffh, 0ffh, 0ffh, 04h, 05h, 0ffh +ptr_L_mldsa_decode_t1_avx2_shuff_2 QWORD L_mldsa_decode_t1_avx2_shuff_2 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t1_avx2_shuff_3 BYTE 06h, 07h, 0ffh, 0ffh, 0ffh, 07h, 08h, 0ffh + BYTE 08h, 09h, 0ffh, 0ffh, 0ffh, 09h, 0ah, 0ffh + BYTE 03h, 04h, 0ffh, 0ffh, 0ffh, 04h, 05h, 0ffh + BYTE 05h, 06h, 0ffh, 0ffh, 0ffh, 06h, 07h, 0ffh +ptr_L_mldsa_decode_t1_avx2_shuff_3 QWORD L_mldsa_decode_t1_avx2_shuff_3 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t1_avx2_vs_8 DWORD 00000000h, 0000000ah, 00000004h, 0000000eh + DWORD 00000000h, 0000000ah, 00000004h, 0000000eh +ptr_L_mldsa_decode_t1_avx2_vs_8 QWORD L_mldsa_decode_t1_avx2_vs_8 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_t1_avx2_mask DWORD 000003ffh, 000003ffh, 000003ffh, 000003ffh + DWORD 000003ffh, 000003ffh, 000003ffh, 000003ffh +ptr_L_mldsa_decode_t1_avx2_mask QWORD L_mldsa_decode_t1_avx2_mask +_DATA ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_decode_t1_avx2 PROC + sub rsp, 80 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vpxor ymm5, ymm5, ymm5 + vmovdqu ymm5, YMMWORD PTR L_mldsa_decode_t1_avx2_shuff_0 + vmovdqu ymm6, YMMWORD PTR L_mldsa_decode_t1_avx2_shuff_1 + vmovdqu ymm7, YMMWORD PTR L_mldsa_decode_t1_avx2_shuff_2 + vmovdqu ymm8, YMMWORD PTR L_mldsa_decode_t1_avx2_shuff_3 + vmovdqu ymm9, YMMWORD PTR L_mldsa_decode_t1_avx2_vs_8 + vmovdqu ymm10, YMMWORD PTR L_mldsa_decode_t1_avx2_mask + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD 
PTR [rcx+96] + ; 1/32 + vpermq ymm4, ymm0, 68 + vpshufb ymm4, ymm4, ymm5 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx], ymm4 + ; 2/32 + vpermq ymm4, ymm0, 153 + vpshufb ymm4, ymm4, ymm6 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+32], ymm4 + ; 3/32 + vpermq ymm4, ymm0, 62 + vpshufb ymm4, ymm4, ymm7 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+64], ymm4 + ; 4/32 + vperm2i128 ymm0, ymm0, ymm1, 33 + vpermq ymm4, ymm0, 233 + vpshufb ymm4, ymm4, ymm8 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+96], ymm4 + ; 5/32 + vpermq ymm4, ymm1, 153 + vpshufb ymm4, ymm4, ymm5 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+128], ymm4 + ; 6/32 + vpermq ymm4, ymm1, 238 + vpshufb ymm4, ymm4, ymm6 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+160], ymm4 + ; 7/32 + vperm2i128 ymm1, ymm1, ymm2, 33 + vpermq ymm4, ymm1, 233 + vpshufb ymm4, ymm4, ymm7 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+192], ymm4 + ; 8/32 + vpermq ymm4, ymm2, 148 + vpshufb ymm4, ymm4, ymm8 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+224], ymm4 + ; 9/32 + vpermq ymm4, ymm2, 238 + vpshufb ymm4, ymm4, ymm5 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+256], ymm4 + ; 10/32 + vperm2i128 ymm2, ymm2, ymm3, 33 + vpermq ymm4, ymm2, 153 + vpshufb ymm4, ymm4, ymm6 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+288], ymm4 + ; 11/32 + vpermq ymm4, ymm3, 148 + vpshufb ymm4, ymm4, ymm7 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, 
ymm4, 13 + vmovdqu YMMWORD PTR [rdx+320], ymm4 + ; 12/32 + vpermq ymm4, ymm3, 233 + vpshufb ymm4, ymm4, ymm8 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+352], ymm4 + ; 13/32 + vmovdqu ymm0, YMMWORD PTR [rcx+128] + vmovdqu ymm1, YMMWORD PTR [rcx+160] + vperm2i128 ymm3, ymm3, ymm0, 33 + vpermq ymm4, ymm3, 153 + vpshufb ymm4, ymm4, ymm5 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+384], ymm4 + ; 14/32 + vpermq ymm4, ymm0, 68 + vpshufb ymm4, ymm4, ymm6 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+416], ymm4 + ; 15/32 + vpermq ymm4, ymm0, 233 + vpshufb ymm4, ymm4, ymm7 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+448], ymm4 + ; 16/32 + vpermq ymm4, ymm0, 62 + vpshufb ymm4, ymm4, ymm8 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+480], ymm4 + ; 17/32 + vpermq ymm4, ymm1, 68 + vpshufb ymm4, ymm4, ymm5 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+512], ymm4 + ; 18/32 + vpermq ymm4, ymm1, 153 + vpshufb ymm4, ymm4, ymm6 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+544], ymm4 + ; 19/32 + vpermq ymm4, ymm1, 62 + vpshufb ymm4, ymm4, ymm7 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+576], ymm4 + ; 20/32 + vmovdqu ymm2, YMMWORD PTR [rcx+192] + vmovdqu ymm3, YMMWORD PTR [rcx+224] + vperm2i128 ymm1, ymm1, ymm2, 33 + vpermq ymm4, ymm1, 233 + vpshufb ymm4, ymm4, ymm8 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+608], ymm4 + ; 21/32 + vpermq ymm4, ymm2, 153 + vpshufb ymm4, ymm4, ymm5 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + 
vmovdqu YMMWORD PTR [rdx+640], ymm4 + ; 22/32 + vpermq ymm4, ymm2, 238 + vpshufb ymm4, ymm4, ymm6 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+672], ymm4 + ; 23/32 + vperm2i128 ymm2, ymm2, ymm3, 33 + vpermq ymm4, ymm2, 233 + vpshufb ymm4, ymm4, ymm7 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+704], ymm4 + ; 24/32 + vpermq ymm4, ymm3, 148 + vpshufb ymm4, ymm4, ymm8 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+736], ymm4 + ; 25/32 + vpermq ymm4, ymm3, 238 + vpshufb ymm4, ymm4, ymm5 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+768], ymm4 + ; 26/32 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vperm2i128 ymm3, ymm3, ymm0, 33 + vpermq ymm4, ymm3, 153 + vpshufb ymm4, ymm4, ymm6 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+800], ymm4 + ; 27/32 + vpermq ymm4, ymm0, 148 + vpshufb ymm4, ymm4, ymm7 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+832], ymm4 + ; 28/32 + vpermq ymm4, ymm0, 233 + vpshufb ymm4, ymm4, ymm8 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+864], ymm4 + ; 29/32 + vperm2i128 ymm0, ymm0, ymm1, 33 + vpermq ymm4, ymm0, 153 + vpshufb ymm4, ymm4, ymm5 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+896], ymm4 + ; 30/32 + vpermq ymm4, ymm1, 68 + vpshufb ymm4, ymm4, ymm6 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+928], ymm4 + ; 31/32 + vpermq ymm4, ymm1, 233 + vpshufb ymm4, ymm4, ymm7 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+960], ymm4 + ; 32/32 + 
vpermq ymm4, ymm1, 62 + vpshufb ymm4, ymm4, ymm8 + vpsrlvd ymm4, ymm4, ymm9 + vpand ymm4, ymm4, ymm10 + vpslld ymm4, ymm4, 13 + vmovdqu YMMWORD PTR [rdx+992], ymm4 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + add rsp, 80 + ret +wc_mldsa_decode_t1_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_gamma1_17_avx2_shuff_0 BYTE 00h, 01h, 02h, 0ffh, 02h, 03h, 04h, 0ffh + BYTE 04h, 05h, 06h, 0ffh, 06h, 07h, 08h, 0ffh + BYTE 0ffh, 01h, 02h, 03h, 0ffh, 03h, 04h, 05h + BYTE 0ffh, 05h, 06h, 07h, 0ffh, 07h, 08h, 09h +ptr_L_mldsa_decode_gamma1_17_avx2_shuff_0 QWORD L_mldsa_decode_gamma1_17_avx2_shuff_0 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_gamma1_17_avx2_shuff_1 BYTE 02h, 03h, 04h, 0ffh, 04h, 05h, 06h, 0ffh + BYTE 06h, 07h, 08h, 0ffh, 08h, 09h, 0ah, 0ffh + BYTE 0ffh, 03h, 04h, 05h, 0ffh, 05h, 06h, 07h + BYTE 0ffh, 07h, 08h, 09h, 0ffh, 09h, 0ah, 0bh +ptr_L_mldsa_decode_gamma1_17_avx2_shuff_1 QWORD L_mldsa_decode_gamma1_17_avx2_shuff_1 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_gamma1_17_avx2_shuff_2 BYTE 04h, 05h, 06h, 0ffh, 06h, 07h, 08h, 0ffh + BYTE 08h, 09h, 0ah, 0ffh, 0ah, 0bh, 0ch, 0ffh + BYTE 0ffh, 05h, 06h, 07h, 0ffh, 07h, 08h, 09h + BYTE 0ffh, 09h, 0ah, 0bh, 0ffh, 0bh, 0ch, 0dh +ptr_L_mldsa_decode_gamma1_17_avx2_shuff_2 QWORD L_mldsa_decode_gamma1_17_avx2_shuff_2 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_gamma1_17_avx2_shuff_3 BYTE 06h, 07h, 08h, 0ffh, 08h, 09h, 0ah, 0ffh + BYTE 0ah, 0bh, 0ch, 0ffh, 0ch, 0dh, 0eh, 0ffh + BYTE 0ffh, 07h, 08h, 09h, 0ffh, 09h, 0ah, 0bh + BYTE 0ffh, 0bh, 0ch, 0dh, 0ffh, 0dh, 0eh, 0fh +ptr_L_mldsa_decode_gamma1_17_avx2_shuff_3 QWORD L_mldsa_decode_gamma1_17_avx2_shuff_3 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_gamma1_17_avx2_vs_8 DWORD 00000000h, 00000002h, 00000004h, 00000006h + DWORD 00000008h, 0000000ah, 0000000ch, 0000000eh 
+ptr_L_mldsa_decode_gamma1_17_avx2_vs_8 QWORD L_mldsa_decode_gamma1_17_avx2_vs_8 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_gamma1_17_avx2_mask DWORD 0003ffffh, 0003ffffh, 0003ffffh, 0003ffffh + DWORD 0003ffffh, 0003ffffh, 0003ffffh, 0003ffffh +ptr_L_mldsa_decode_gamma1_17_avx2_mask QWORD L_mldsa_decode_gamma1_17_avx2_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_gamma1_17_avx2_gamma17 DWORD 00020000h, 00020000h, 00020000h, 00020000h + DWORD 00020000h, 00020000h, 00020000h, 00020000h +ptr_L_mldsa_decode_gamma1_17_avx2_gamma17 QWORD L_mldsa_decode_gamma1_17_avx2_gamma17 +_DATA ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_decode_gamma1_17_avx2 PROC + sub rsp, 128 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu ymm7, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_shuff_0 + vmovdqu ymm8, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_shuff_1 + vmovdqu ymm9, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_shuff_2 + vmovdqu ymm10, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_shuff_3 + vmovdqu ymm11, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_vs_8 + vmovdqu ymm12, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_mask + vmovdqu ymm13, YMMWORD PTR L_mldsa_decode_gamma1_17_avx2_gamma17 + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vmovdqu ymm4, YMMWORD PTR [rcx+128] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + ; 0/15 + vpermq ymm6, ymm0, 148 + vpshufb ymm6, ymm6, ymm7 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx], ymm6 + ; 1/15 + vperm2i128 ymm6, ymm0, ymm1, 33 + vpermq ymm6, ymm6, 148 + vpshufb ymm6, ymm6, ymm8 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, 
ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+32], ymm6 + ; 2/15 + vpermq ymm6, ymm1, 148 + vpshufb ymm6, ymm6, ymm9 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+64], ymm6 + ; 3/15 + vperm2i128 ymm6, ymm1, ymm2, 33 + vpermq ymm6, ymm6, 148 + vpshufb ymm6, ymm6, ymm10 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+96], ymm6 + ; 4/15 + vpermq ymm6, ymm2, 233 + vpshufb ymm6, ymm6, ymm7 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+128], ymm6 + ; 5/15 + vperm2i128 ymm6, ymm2, ymm3, 33 + vpermq ymm6, ymm6, 233 + vpshufb ymm6, ymm6, ymm8 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+160], ymm6 + ; 6/15 + vpermq ymm6, ymm3, 233 + vpshufb ymm6, ymm6, ymm9 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+192], ymm6 + ; 7/15 + vperm2i128 ymm6, ymm3, ymm4, 33 + vpermq ymm6, ymm6, 233 + vpshufb ymm6, ymm6, ymm10 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+224], ymm6 + ; 8/15 + vperm2i128 ymm6, ymm4, ymm5, 33 + vpermq ymm6, ymm6, 148 + vpshufb ymm6, ymm6, ymm7 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+256], ymm6 + ; 9/15 + vpermq ymm6, ymm5, 148 + vpshufb ymm6, ymm6, ymm8 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+288], ymm6 + vmovdqu ymm0, YMMWORD PTR [rcx+192] + vmovdqu ymm1, YMMWORD PTR [rcx+224] + vmovdqu ymm2, YMMWORD PTR [rcx+256] + ; 10/15 + vperm2i128 ymm6, ymm5, ymm0, 33 + vpermq ymm6, ymm6, 148 + vpshufb ymm6, ymm6, ymm9 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+320], ymm6 + ; 11/15 + vpermq ymm6, ymm0, 148 + vpshufb 
ymm6, ymm6, ymm10 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+352], ymm6 + ; 12/15 + vperm2i128 ymm6, ymm0, ymm1, 33 + vpermq ymm6, ymm6, 233 + vpshufb ymm6, ymm6, ymm7 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+384], ymm6 + ; 13/15 + vpermq ymm6, ymm1, 233 + vpshufb ymm6, ymm6, ymm8 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+416], ymm6 + ; 14/15 + vperm2i128 ymm6, ymm1, ymm2, 33 + vpermq ymm6, ymm6, 233 + vpshufb ymm6, ymm6, ymm9 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+448], ymm6 + ; 15/15 + vpermq ymm6, ymm2, 233 + vpshufb ymm6, ymm6, ymm10 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+480], ymm6 + vmovdqu ymm0, YMMWORD PTR [rcx+288] + vmovdqu ymm1, YMMWORD PTR [rcx+320] + vmovdqu ymm2, YMMWORD PTR [rcx+352] + vmovdqu ymm3, YMMWORD PTR [rcx+384] + vmovdqu ymm4, YMMWORD PTR [rcx+416] + vmovdqu ymm5, YMMWORD PTR [rcx+448] + ; 0/15 + vpermq ymm6, ymm0, 148 + vpshufb ymm6, ymm6, ymm7 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+512], ymm6 + ; 1/15 + vperm2i128 ymm6, ymm0, ymm1, 33 + vpermq ymm6, ymm6, 148 + vpshufb ymm6, ymm6, ymm8 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+544], ymm6 + ; 2/15 + vpermq ymm6, ymm1, 148 + vpshufb ymm6, ymm6, ymm9 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+576], ymm6 + ; 3/15 + vperm2i128 ymm6, ymm1, ymm2, 33 + vpermq ymm6, ymm6, 148 + vpshufb ymm6, ymm6, ymm10 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+608], ymm6 + ; 4/15 + vpermq ymm6, ymm2, 233 + vpshufb ymm6, 
ymm6, ymm7 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+640], ymm6 + ; 5/15 + vperm2i128 ymm6, ymm2, ymm3, 33 + vpermq ymm6, ymm6, 233 + vpshufb ymm6, ymm6, ymm8 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+672], ymm6 + ; 6/15 + vpermq ymm6, ymm3, 233 + vpshufb ymm6, ymm6, ymm9 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+704], ymm6 + ; 7/15 + vperm2i128 ymm6, ymm3, ymm4, 33 + vpermq ymm6, ymm6, 233 + vpshufb ymm6, ymm6, ymm10 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+736], ymm6 + ; 8/15 + vperm2i128 ymm6, ymm4, ymm5, 33 + vpermq ymm6, ymm6, 148 + vpshufb ymm6, ymm6, ymm7 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+768], ymm6 + ; 9/15 + vpermq ymm6, ymm5, 148 + vpshufb ymm6, ymm6, ymm8 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+800], ymm6 + vmovdqu ymm0, YMMWORD PTR [rcx+480] + vmovdqu ymm1, YMMWORD PTR [rcx+512] + vmovdqu ymm2, YMMWORD PTR [rcx+544] + ; 10/15 + vperm2i128 ymm6, ymm5, ymm0, 33 + vpermq ymm6, ymm6, 148 + vpshufb ymm6, ymm6, ymm9 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+832], ymm6 + ; 11/15 + vpermq ymm6, ymm0, 148 + vpshufb ymm6, ymm6, ymm10 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+864], ymm6 + ; 12/15 + vperm2i128 ymm6, ymm0, ymm1, 33 + vpermq ymm6, ymm6, 233 + vpshufb ymm6, ymm6, ymm7 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+896], ymm6 + ; 13/15 + vpermq ymm6, ymm1, 233 + vpshufb ymm6, ymm6, ymm8 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, 
ymm6 + vmovdqu YMMWORD PTR [rdx+928], ymm6 + ; 14/15 + vperm2i128 ymm6, ymm1, ymm2, 33 + vpermq ymm6, ymm6, 233 + vpshufb ymm6, ymm6, ymm9 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+960], ymm6 + ; 15/15 + vpermq ymm6, ymm2, 233 + vpshufb ymm6, ymm6, ymm10 + vpsrlvd ymm6, ymm6, ymm11 + vpand ymm6, ymm6, ymm12 + vpsubd ymm6, ymm13, ymm6 + vmovdqu YMMWORD PTR [rdx+992], ymm6 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + add rsp, 128 + ret +wc_mldsa_decode_gamma1_17_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_gamma1_20_avx2_shuff_0 BYTE 00h, 01h, 02h, 0ffh, 02h, 03h, 04h, 0ffh + BYTE 05h, 06h, 07h, 0ffh, 07h, 08h, 09h, 0ffh + BYTE 0ffh, 02h, 03h, 04h, 0ffh, 04h, 05h, 06h + BYTE 0ffh, 07h, 08h, 09h, 0ffh, 09h, 0ah, 0bh +ptr_L_mldsa_decode_gamma1_20_avx2_shuff_0 QWORD L_mldsa_decode_gamma1_20_avx2_shuff_0 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_gamma1_20_avx2_shuff_1 BYTE 04h, 05h, 06h, 0ffh, 06h, 07h, 08h, 0ffh + BYTE 09h, 0ah, 0bh, 0ffh, 0bh, 0ch, 0dh, 0ffh + BYTE 0ffh, 06h, 07h, 08h, 0ffh, 08h, 09h, 0ah + BYTE 0ffh, 0bh, 0ch, 0dh, 0ffh, 0dh, 0eh, 0fh +ptr_L_mldsa_decode_gamma1_20_avx2_shuff_1 QWORD L_mldsa_decode_gamma1_20_avx2_shuff_1 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_gamma1_20_avx2_vs_8 DWORD 00000000h, 00000004h, 00000000h, 00000004h + DWORD 00000008h, 0000000ch, 00000008h, 0000000ch +ptr_L_mldsa_decode_gamma1_20_avx2_vs_8 QWORD L_mldsa_decode_gamma1_20_avx2_vs_8 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_gamma1_20_avx2_mask DWORD 000fffffh, 000fffffh, 000fffffh, 000fffffh + DWORD 000fffffh, 000fffffh, 000fffffh, 000fffffh +ptr_L_mldsa_decode_gamma1_20_avx2_mask QWORD 
L_mldsa_decode_gamma1_20_avx2_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decode_gamma1_20_avx2_gamma19 DWORD 00080000h, 00080000h, 00080000h, 00080000h + DWORD 00080000h, 00080000h, 00080000h, 00080000h +ptr_L_mldsa_decode_gamma1_20_avx2_gamma19 QWORD L_mldsa_decode_gamma1_20_avx2_gamma19 +_DATA ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_decode_gamma1_19_avx2 PROC + sub rsp, 80 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu ymm6, YMMWORD PTR L_mldsa_decode_gamma1_20_avx2_shuff_0 + vmovdqu ymm7, YMMWORD PTR L_mldsa_decode_gamma1_20_avx2_shuff_1 + vmovdqu ymm8, YMMWORD PTR L_mldsa_decode_gamma1_20_avx2_vs_8 + vmovdqu ymm9, YMMWORD PTR L_mldsa_decode_gamma1_20_avx2_mask + vmovdqu ymm10, YMMWORD PTR L_mldsa_decode_gamma1_20_avx2_gamma19 + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vmovdqu ymm4, YMMWORD PTR [rcx+128] + ; 0/7 + vpermq ymm5, ymm0, 148 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx], ymm5 + ; 1/7 + vperm2i128 ymm5, ymm0, ymm1, 33 + vpermq ymm5, ymm5, 148 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+32], ymm5 + ; 2/7 + vpermq ymm5, ymm1, 233 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm5 + ; 3/7 + vperm2i128 ymm5, ymm1, ymm2, 33 + vpermq ymm5, ymm5, 233 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+96], ymm5 + ; 4/7 + vperm2i128 ymm5, ymm2, ymm3, 33 + vpermq ymm5, ymm5, 148 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd 
ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+128], ymm5 + ; 5/7 + vpermq ymm5, ymm3, 148 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+160], ymm5 + ; 6/7 + vperm2i128 ymm5, ymm3, ymm4, 33 + vpermq ymm5, ymm5, 233 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+192], ymm5 + ; 7/7 + vpermq ymm5, ymm4, 233 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+224], ymm5 + vmovdqu ymm0, YMMWORD PTR [rcx+160] + vmovdqu ymm1, YMMWORD PTR [rcx+192] + vmovdqu ymm2, YMMWORD PTR [rcx+224] + vmovdqu ymm3, YMMWORD PTR [rcx+256] + vmovdqu ymm4, YMMWORD PTR [rcx+288] + ; 0/7 + vpermq ymm5, ymm0, 148 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+256], ymm5 + ; 1/7 + vperm2i128 ymm5, ymm0, ymm1, 33 + vpermq ymm5, ymm5, 148 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+288], ymm5 + ; 2/7 + vpermq ymm5, ymm1, 233 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+320], ymm5 + ; 3/7 + vperm2i128 ymm5, ymm1, ymm2, 33 + vpermq ymm5, ymm5, 233 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+352], ymm5 + ; 4/7 + vperm2i128 ymm5, ymm2, ymm3, 33 + vpermq ymm5, ymm5, 148 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+384], ymm5 + ; 5/7 + vpermq ymm5, ymm3, 148 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+416], ymm5 + ; 6/7 + vperm2i128 
ymm5, ymm3, ymm4, 33 + vpermq ymm5, ymm5, 233 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+448], ymm5 + ; 7/7 + vpermq ymm5, ymm4, 233 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+480], ymm5 + vmovdqu ymm0, YMMWORD PTR [rcx+320] + vmovdqu ymm1, YMMWORD PTR [rcx+352] + vmovdqu ymm2, YMMWORD PTR [rcx+384] + vmovdqu ymm3, YMMWORD PTR [rcx+416] + vmovdqu ymm4, YMMWORD PTR [rcx+448] + ; 0/7 + vpermq ymm5, ymm0, 148 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+512], ymm5 + ; 1/7 + vperm2i128 ymm5, ymm0, ymm1, 33 + vpermq ymm5, ymm5, 148 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+544], ymm5 + ; 2/7 + vpermq ymm5, ymm1, 233 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+576], ymm5 + ; 3/7 + vperm2i128 ymm5, ymm1, ymm2, 33 + vpermq ymm5, ymm5, 233 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+608], ymm5 + ; 4/7 + vperm2i128 ymm5, ymm2, ymm3, 33 + vpermq ymm5, ymm5, 148 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+640], ymm5 + ; 5/7 + vpermq ymm5, ymm3, 148 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+672], ymm5 + ; 6/7 + vperm2i128 ymm5, ymm3, ymm4, 33 + vpermq ymm5, ymm5, 233 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+704], ymm5 + ; 7/7 + vpermq ymm5, ymm4, 233 + vpshufb ymm5, ymm5, ymm7 + 
vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+736], ymm5 + vmovdqu ymm0, YMMWORD PTR [rcx+480] + vmovdqu ymm1, YMMWORD PTR [rcx+512] + vmovdqu ymm2, YMMWORD PTR [rcx+544] + vmovdqu ymm3, YMMWORD PTR [rcx+576] + vmovdqu ymm4, YMMWORD PTR [rcx+608] + ; 0/7 + vpermq ymm5, ymm0, 148 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+768], ymm5 + ; 1/7 + vperm2i128 ymm5, ymm0, ymm1, 33 + vpermq ymm5, ymm5, 148 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+800], ymm5 + ; 2/7 + vpermq ymm5, ymm1, 233 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+832], ymm5 + ; 3/7 + vperm2i128 ymm5, ymm1, ymm2, 33 + vpermq ymm5, ymm5, 233 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+864], ymm5 + ; 4/7 + vperm2i128 ymm5, ymm2, ymm3, 33 + vpermq ymm5, ymm5, 148 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+896], ymm5 + ; 5/7 + vpermq ymm5, ymm3, 148 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+928], ymm5 + ; 6/7 + vperm2i128 ymm5, ymm3, ymm4, 33 + vpermq ymm5, ymm5, 233 + vpshufb ymm5, ymm5, ymm6 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+960], ymm5 + ; 7/7 + vpermq ymm5, ymm4, 233 + vpshufb ymm5, ymm5, ymm7 + vpsrlvd ymm5, ymm5, ymm8 + vpand ymm5, ymm5, ymm9 + vpsubd ymm5, ymm10, ymm5 + vmovdqu YMMWORD PTR [rdx+992], ymm5 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR 
[rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + add rsp, 80 + ret +wc_mldsa_decode_gamma1_19_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_gamma1_17_avx2_gamma17 DWORD 00020000h, 00020000h, 00020000h, 00020000h + DWORD 00020000h, 00020000h, 00020000h, 00020000h +ptr_L_mldsa_encode_gamma1_17_avx2_gamma17 QWORD L_mldsa_encode_gamma1_17_avx2_gamma17 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_gamma1_17_avx2_shuff_even BYTE 00h, 01h, 02h, 0ffh, 08h, 09h, 0ah, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 00h, 01h, 02h, 0ffh, 08h, 09h, 0ah, 0ffh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_encode_gamma1_17_avx2_shuff_even QWORD L_mldsa_encode_gamma1_17_avx2_shuff_even +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_gamma1_17_avx2_shuff_odd BYTE 0ffh, 0ffh, 04h, 05h, 06h, 0ffh, 0ch, 0dh + BYTE 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 04h, 05h, 06h, 0ffh, 0ch, 0dh + BYTE 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_encode_gamma1_17_avx2_shuff_odd QWORD L_mldsa_encode_gamma1_17_avx2_shuff_odd +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_gamma1_17_avx2_vs DWORD 00000000h, 00000002h, 00000004h, 00000006h + DWORD 00000000h, 00000002h, 00000004h, 00000006h +ptr_L_mldsa_encode_gamma1_17_avx2_vs QWORD L_mldsa_encode_gamma1_17_avx2_vs +_DATA ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_encode_gamma1_17_avx2 PROC + sub rsp, 96 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vpxor ymm8, ymm8, ymm8 + vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_gamma1_17_avx2_gamma17 + vmovdqu ymm9, YMMWORD PTR L_mldsa_encode_gamma1_17_avx2_shuff_even + vmovdqu ymm10, YMMWORD PTR L_mldsa_encode_gamma1_17_avx2_shuff_odd + vmovdqu ymm11, YMMWORD PTR L_mldsa_encode_gamma1_17_avx2_vs + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, 
YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrb BYTE PTR [rdx+8], xmm0, 8 + movq QWORD PTR [rdx+9], xmm4 + vpextrb BYTE PTR [rdx+17], xmm4, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm1 + vpextrb BYTE PTR [rdx+8], xmm1, 8 + movq QWORD PTR [rdx+9], xmm5 + vpextrb BYTE PTR [rdx+17], xmm5, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm2 + vpextrb BYTE PTR [rdx+8], xmm2, 8 + movq QWORD PTR [rdx+9], xmm6 + vpextrb BYTE PTR [rdx+17], xmm6, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm3 + vpextrb BYTE PTR [rdx+8], xmm3, 8 + movq QWORD PTR [rdx+9], xmm7 + vpextrb BYTE PTR [rdx+17], xmm7, 8 + add rdx, 18 + vmovdqu ymm0, YMMWORD PTR [rcx+128] + vmovdqu ymm1, YMMWORD PTR [rcx+160] + vmovdqu ymm2, YMMWORD PTR [rcx+192] + vmovdqu ymm3, YMMWORD PTR [rcx+224] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor 
ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrb BYTE PTR [rdx+8], xmm0, 8 + movq QWORD PTR [rdx+9], xmm4 + vpextrb BYTE PTR [rdx+17], xmm4, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm1 + vpextrb BYTE PTR [rdx+8], xmm1, 8 + movq QWORD PTR [rdx+9], xmm5 + vpextrb BYTE PTR [rdx+17], xmm5, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm2 + vpextrb BYTE PTR [rdx+8], xmm2, 8 + movq QWORD PTR [rdx+9], xmm6 + vpextrb BYTE PTR [rdx+17], xmm6, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm3 + vpextrb BYTE PTR [rdx+8], xmm3, 8 + movq QWORD PTR [rdx+9], xmm7 + vpextrb BYTE PTR [rdx+17], xmm7, 8 + add rdx, 18 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrb BYTE PTR [rdx+8], xmm0, 8 + movq QWORD PTR [rdx+9], xmm4 + vpextrb BYTE PTR [rdx+17], xmm4, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm1 + vpextrb BYTE PTR [rdx+8], xmm1, 8 + movq QWORD PTR [rdx+9], xmm5 + vpextrb BYTE PTR [rdx+17], xmm5, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm2 + vpextrb BYTE PTR [rdx+8], xmm2, 8 + movq QWORD PTR [rdx+9], xmm6 + vpextrb BYTE PTR [rdx+17], xmm6, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm3 + 
vpextrb BYTE PTR [rdx+8], xmm3, 8 + movq QWORD PTR [rdx+9], xmm7 + vpextrb BYTE PTR [rdx+17], xmm7, 8 + add rdx, 18 + vmovdqu ymm0, YMMWORD PTR [rcx+384] + vmovdqu ymm1, YMMWORD PTR [rcx+416] + vmovdqu ymm2, YMMWORD PTR [rcx+448] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrb BYTE PTR [rdx+8], xmm0, 8 + movq QWORD PTR [rdx+9], xmm4 + vpextrb BYTE PTR [rdx+17], xmm4, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm1 + vpextrb BYTE PTR [rdx+8], xmm1, 8 + movq QWORD PTR [rdx+9], xmm5 + vpextrb BYTE PTR [rdx+17], xmm5, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm2 + vpextrb BYTE PTR [rdx+8], xmm2, 8 + movq QWORD PTR [rdx+9], xmm6 + vpextrb BYTE PTR [rdx+17], xmm6, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm3 + vpextrb BYTE PTR [rdx+8], xmm3, 8 + movq QWORD PTR [rdx+9], xmm7 + vpextrb BYTE PTR [rdx+17], xmm7, 8 + add rdx, 18 + vmovdqu ymm0, YMMWORD PTR [rcx+512] + vmovdqu ymm1, YMMWORD PTR [rcx+544] + vmovdqu ymm2, YMMWORD PTR [rcx+576] + vmovdqu ymm3, YMMWORD PTR [rcx+608] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, 
ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrb BYTE PTR [rdx+8], xmm0, 8 + movq QWORD PTR [rdx+9], xmm4 + vpextrb BYTE PTR [rdx+17], xmm4, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm1 + vpextrb BYTE PTR [rdx+8], xmm1, 8 + movq QWORD PTR [rdx+9], xmm5 + vpextrb BYTE PTR [rdx+17], xmm5, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm2 + vpextrb BYTE PTR [rdx+8], xmm2, 8 + movq QWORD PTR [rdx+9], xmm6 + vpextrb BYTE PTR [rdx+17], xmm6, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm3 + vpextrb BYTE PTR [rdx+8], xmm3, 8 + movq QWORD PTR [rdx+9], xmm7 + vpextrb BYTE PTR [rdx+17], xmm7, 8 + add rdx, 18 + vmovdqu ymm0, YMMWORD PTR [rcx+640] + vmovdqu ymm1, YMMWORD PTR [rcx+672] + vmovdqu ymm2, YMMWORD PTR [rcx+704] + vmovdqu ymm3, YMMWORD PTR [rcx+736] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrb BYTE PTR [rdx+8], xmm0, 8 + movq QWORD PTR [rdx+9], xmm4 + vpextrb BYTE PTR [rdx+17], xmm4, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm1 + vpextrb BYTE PTR [rdx+8], xmm1, 8 + movq QWORD PTR [rdx+9], xmm5 + vpextrb BYTE PTR [rdx+17], xmm5, 8 + add rdx, 18 + movq QWORD 
PTR [rdx], xmm2 + vpextrb BYTE PTR [rdx+8], xmm2, 8 + movq QWORD PTR [rdx+9], xmm6 + vpextrb BYTE PTR [rdx+17], xmm6, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm3 + vpextrb BYTE PTR [rdx+8], xmm3, 8 + movq QWORD PTR [rdx+9], xmm7 + vpextrb BYTE PTR [rdx+17], xmm7, 8 + add rdx, 18 + vmovdqu ymm0, YMMWORD PTR [rcx+768] + vmovdqu ymm1, YMMWORD PTR [rcx+800] + vmovdqu ymm2, YMMWORD PTR [rcx+832] + vmovdqu ymm3, YMMWORD PTR [rcx+864] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrb BYTE PTR [rdx+8], xmm0, 8 + movq QWORD PTR [rdx+9], xmm4 + vpextrb BYTE PTR [rdx+17], xmm4, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm1 + vpextrb BYTE PTR [rdx+8], xmm1, 8 + movq QWORD PTR [rdx+9], xmm5 + vpextrb BYTE PTR [rdx+17], xmm5, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm2 + vpextrb BYTE PTR [rdx+8], xmm2, 8 + movq QWORD PTR [rdx+9], xmm6 + vpextrb BYTE PTR [rdx+17], xmm6, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm3 + vpextrb BYTE PTR [rdx+8], xmm3, 8 + movq QWORD PTR [rdx+9], xmm7 + vpextrb BYTE PTR [rdx+17], xmm7, 8 + add rdx, 18 + vmovdqu ymm0, YMMWORD PTR [rcx+896] + vmovdqu ymm1, YMMWORD PTR [rcx+928] + vmovdqu ymm2, YMMWORD PTR [rcx+960] + vmovdqu ymm3, YMMWORD PTR [rcx+992] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, 
ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrb BYTE PTR [rdx+8], xmm0, 8 + movq QWORD PTR [rdx+9], xmm4 + vpextrb BYTE PTR [rdx+17], xmm4, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm1 + vpextrb BYTE PTR [rdx+8], xmm1, 8 + movq QWORD PTR [rdx+9], xmm5 + vpextrb BYTE PTR [rdx+17], xmm5, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm2 + vpextrb BYTE PTR [rdx+8], xmm2, 8 + movq QWORD PTR [rdx+9], xmm6 + vpextrb BYTE PTR [rdx+17], xmm6, 8 + add rdx, 18 + movq QWORD PTR [rdx], xmm3 + vpextrb BYTE PTR [rdx+8], xmm3, 8 + movq QWORD PTR [rdx+9], xmm7 + vpextrb BYTE PTR [rdx+17], xmm7, 8 + add rdx, 18 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + add rsp, 96 + ret +wc_mldsa_encode_gamma1_17_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_gamma1_19_avx2_gamma19 DWORD 00080000h, 00080000h, 00080000h, 00080000h + DWORD 00080000h, 00080000h, 00080000h, 00080000h +ptr_L_mldsa_encode_gamma1_19_avx2_gamma19 QWORD L_mldsa_encode_gamma1_19_avx2_gamma19 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_gamma1_19_avx2_shuff_even BYTE 00h, 01h, 02h, 0ffh, 0ffh, 08h, 09h, 0ah + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 00h, 01h, 02h, 0ffh, 0ffh, 08h, 09h, 0ah + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_encode_gamma1_19_avx2_shuff_even QWORD L_mldsa_encode_gamma1_19_avx2_shuff_even +_DATA ENDS +_DATA 
SEGMENT +ALIGN 16 +L_mldsa_encode_gamma1_19_avx2_shuff_odd BYTE 0ffh, 0ffh, 04h, 05h, 06h, 0ffh, 0ffh, 0ch + BYTE 0dh, 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 04h, 05h, 06h, 0ffh, 0ffh, 0ch + BYTE 0dh, 0eh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh +ptr_L_mldsa_encode_gamma1_19_avx2_shuff_odd QWORD L_mldsa_encode_gamma1_19_avx2_shuff_odd +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_encode_gamma1_19_avx2_vs DWORD 00000000h, 00000004h, 00000000h, 00000004h + DWORD 00000000h, 00000004h, 00000000h, 00000004h +ptr_L_mldsa_encode_gamma1_19_avx2_vs QWORD L_mldsa_encode_gamma1_19_avx2_vs +_DATA ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_encode_gamma1_19_avx2 PROC + sub rsp, 96 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vpxor ymm8, ymm8, ymm8 + vmovdqu ymm8, YMMWORD PTR L_mldsa_encode_gamma1_19_avx2_gamma19 + vmovdqu ymm9, YMMWORD PTR L_mldsa_encode_gamma1_19_avx2_shuff_even + vmovdqu ymm10, YMMWORD PTR L_mldsa_encode_gamma1_19_avx2_shuff_odd + vmovdqu ymm11, YMMWORD PTR L_mldsa_encode_gamma1_19_avx2_vs + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD 
PTR [rdx], xmm0 + vpextrw WORD PTR [rdx+8], xmm0, 4 + movq QWORD PTR [rdx+10], xmm4 + vpextrw WORD PTR [rdx+18], xmm4, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm1 + vpextrw WORD PTR [rdx+8], xmm1, 4 + movq QWORD PTR [rdx+10], xmm5 + vpextrw WORD PTR [rdx+18], xmm5, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm2 + vpextrw WORD PTR [rdx+8], xmm2, 4 + movq QWORD PTR [rdx+10], xmm6 + vpextrw WORD PTR [rdx+18], xmm6, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm3 + vpextrw WORD PTR [rdx+8], xmm3, 4 + movq QWORD PTR [rdx+10], xmm7 + vpextrw WORD PTR [rdx+18], xmm7, 4 + add rdx, 20 + vmovdqu ymm0, YMMWORD PTR [rcx+128] + vmovdqu ymm1, YMMWORD PTR [rcx+160] + vmovdqu ymm2, YMMWORD PTR [rcx+192] + vmovdqu ymm3, YMMWORD PTR [rcx+224] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrw WORD PTR [rdx+8], xmm0, 4 + movq QWORD PTR [rdx+10], xmm4 + vpextrw WORD PTR [rdx+18], xmm4, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm1 + vpextrw WORD PTR [rdx+8], xmm1, 4 + movq QWORD PTR [rdx+10], xmm5 + vpextrw WORD PTR [rdx+18], xmm5, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm2 + vpextrw WORD PTR [rdx+8], xmm2, 4 + movq QWORD PTR [rdx+10], xmm6 + vpextrw WORD PTR [rdx+18], xmm6, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm3 + vpextrw WORD PTR [rdx+8], xmm3, 4 + movq QWORD PTR [rdx+10], xmm7 + vpextrw WORD PTR [rdx+18], xmm7, 4 + add rdx, 20 + vmovdqu ymm0, YMMWORD PTR 
[rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrw WORD PTR [rdx+8], xmm0, 4 + movq QWORD PTR [rdx+10], xmm4 + vpextrw WORD PTR [rdx+18], xmm4, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm1 + vpextrw WORD PTR [rdx+8], xmm1, 4 + movq QWORD PTR [rdx+10], xmm5 + vpextrw WORD PTR [rdx+18], xmm5, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm2 + vpextrw WORD PTR [rdx+8], xmm2, 4 + movq QWORD PTR [rdx+10], xmm6 + vpextrw WORD PTR [rdx+18], xmm6, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm3 + vpextrw WORD PTR [rdx+8], xmm3, 4 + movq QWORD PTR [rdx+10], xmm7 + vpextrw WORD PTR [rdx+18], xmm7, 4 + add rdx, 20 + vmovdqu ymm0, YMMWORD PTR [rcx+384] + vmovdqu ymm1, YMMWORD PTR [rcx+416] + vmovdqu ymm2, YMMWORD PTR [rcx+448] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 
+ vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrw WORD PTR [rdx+8], xmm0, 4 + movq QWORD PTR [rdx+10], xmm4 + vpextrw WORD PTR [rdx+18], xmm4, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm1 + vpextrw WORD PTR [rdx+8], xmm1, 4 + movq QWORD PTR [rdx+10], xmm5 + vpextrw WORD PTR [rdx+18], xmm5, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm2 + vpextrw WORD PTR [rdx+8], xmm2, 4 + movq QWORD PTR [rdx+10], xmm6 + vpextrw WORD PTR [rdx+18], xmm6, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm3 + vpextrw WORD PTR [rdx+8], xmm3, 4 + movq QWORD PTR [rdx+10], xmm7 + vpextrw WORD PTR [rdx+18], xmm7, 4 + add rdx, 20 + vmovdqu ymm0, YMMWORD PTR [rcx+512] + vmovdqu ymm1, YMMWORD PTR [rcx+544] + vmovdqu ymm2, YMMWORD PTR [rcx+576] + vmovdqu ymm3, YMMWORD PTR [rcx+608] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrw WORD PTR [rdx+8], xmm0, 4 + movq QWORD PTR [rdx+10], xmm4 + vpextrw WORD PTR [rdx+18], xmm4, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm1 + vpextrw WORD PTR [rdx+8], xmm1, 4 + movq QWORD PTR [rdx+10], xmm5 + vpextrw WORD PTR [rdx+18], xmm5, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm2 + vpextrw WORD PTR [rdx+8], xmm2, 4 + movq QWORD PTR [rdx+10], xmm6 + vpextrw WORD PTR [rdx+18], xmm6, 4 + add rdx, 20 
+ movq QWORD PTR [rdx], xmm3 + vpextrw WORD PTR [rdx+8], xmm3, 4 + movq QWORD PTR [rdx+10], xmm7 + vpextrw WORD PTR [rdx+18], xmm7, 4 + add rdx, 20 + vmovdqu ymm0, YMMWORD PTR [rcx+640] + vmovdqu ymm1, YMMWORD PTR [rcx+672] + vmovdqu ymm2, YMMWORD PTR [rcx+704] + vmovdqu ymm3, YMMWORD PTR [rcx+736] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrw WORD PTR [rdx+8], xmm0, 4 + movq QWORD PTR [rdx+10], xmm4 + vpextrw WORD PTR [rdx+18], xmm4, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm1 + vpextrw WORD PTR [rdx+8], xmm1, 4 + movq QWORD PTR [rdx+10], xmm5 + vpextrw WORD PTR [rdx+18], xmm5, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm2 + vpextrw WORD PTR [rdx+8], xmm2, 4 + movq QWORD PTR [rdx+10], xmm6 + vpextrw WORD PTR [rdx+18], xmm6, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm3 + vpextrw WORD PTR [rdx+8], xmm3, 4 + movq QWORD PTR [rdx+10], xmm7 + vpextrw WORD PTR [rdx+18], xmm7, 4 + add rdx, 20 + vmovdqu ymm0, YMMWORD PTR [rcx+768] + vmovdqu ymm1, YMMWORD PTR [rcx+800] + vmovdqu ymm2, YMMWORD PTR [rcx+832] + vmovdqu ymm3, YMMWORD PTR [rcx+864] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + 
vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrw WORD PTR [rdx+8], xmm0, 4 + movq QWORD PTR [rdx+10], xmm4 + vpextrw WORD PTR [rdx+18], xmm4, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm1 + vpextrw WORD PTR [rdx+8], xmm1, 4 + movq QWORD PTR [rdx+10], xmm5 + vpextrw WORD PTR [rdx+18], xmm5, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm2 + vpextrw WORD PTR [rdx+8], xmm2, 4 + movq QWORD PTR [rdx+10], xmm6 + vpextrw WORD PTR [rdx+18], xmm6, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm3 + vpextrw WORD PTR [rdx+8], xmm3, 4 + movq QWORD PTR [rdx+10], xmm7 + vpextrw WORD PTR [rdx+18], xmm7, 4 + add rdx, 20 + vmovdqu ymm0, YMMWORD PTR [rcx+896] + vmovdqu ymm1, YMMWORD PTR [rcx+928] + vmovdqu ymm2, YMMWORD PTR [rcx+960] + vmovdqu ymm3, YMMWORD PTR [rcx+992] + vpsubd ymm0, ymm8, ymm0 + vpsubd ymm1, ymm8, ymm1 + vpsubd ymm2, ymm8, ymm2 + vpsubd ymm3, ymm8, ymm3 + vpsllvd ymm0, ymm0, ymm11 + vpsllvd ymm1, ymm1, ymm11 + vpsllvd ymm2, ymm2, ymm11 + vpsllvd ymm3, ymm3, ymm11 + vpshufb ymm4, ymm0, ymm10 + vpshufb ymm5, ymm1, ymm10 + vpshufb ymm6, ymm2, ymm10 + vpshufb ymm7, ymm3, ymm10 + vpshufb ymm0, ymm0, ymm9 + vpshufb ymm1, ymm1, ymm9 + vpshufb ymm2, ymm2, ymm9 + vpshufb ymm3, ymm3, ymm9 + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm5 + vpor ymm2, ymm2, ymm6 + vpor ymm3, ymm3, ymm7 + vextracti128 xmm4, ymm0, 1 + vextracti128 xmm5, ymm1, 1 + vextracti128 xmm6, ymm2, 1 + vextracti128 xmm7, ymm3, 1 + movq QWORD PTR [rdx], xmm0 + vpextrw WORD PTR [rdx+8], xmm0, 4 + movq QWORD PTR [rdx+10], xmm4 + vpextrw WORD PTR [rdx+18], xmm4, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm1 + vpextrw WORD PTR [rdx+8], xmm1, 4 + movq QWORD PTR [rdx+10], xmm5 + vpextrw WORD 
PTR [rdx+18], xmm5, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm2 + vpextrw WORD PTR [rdx+8], xmm2, 4 + movq QWORD PTR [rdx+10], xmm6 + vpextrw WORD PTR [rdx+18], xmm6, 4 + add rdx, 20 + movq QWORD PTR [rdx], xmm3 + vpextrw WORD PTR [rdx+8], xmm3, 4 + movq QWORD PTR [rdx+10], xmm7 + vpextrw WORD PTR [rdx+18], xmm7, 4 + add rdx, 20 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + add rsp, 96 + ret +wc_mldsa_encode_gamma1_19_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decompose_q88_avx2_q_low_88 DWORD 00017400h, 00017400h, 00017400h, 00017400h + DWORD 00017400h, 00017400h, 00017400h, 00017400h +ptr_L_mldsa_decompose_q88_avx2_q_low_88 QWORD L_mldsa_decompose_q88_avx2_q_low_88 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decompose_q88_avx2_q_low_88_2 DWORD 0002e800h, 0002e800h, 0002e800h, 0002e800h + DWORD 0002e800h, 0002e800h, 0002e800h, 0002e800h +ptr_L_mldsa_decompose_q88_avx2_q_low_88_2 QWORD L_mldsa_decompose_q88_avx2_q_low_88_2 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decompose_q88_avx2_q_2 DWORD 003fefd4h, 003fefd4h, 003fefd4h, 003fefd4h + DWORD 003fefd4h, 003fefd4h, 003fefd4h, 003fefd4h +ptr_L_mldsa_decompose_q88_avx2_q_2 QWORD L_mldsa_decompose_q88_avx2_q_2 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mldsa_decompose_q88_avx2_44 DWORD 0000002ch, 0000002ch, 0000002ch, 0000002ch + DWORD 0000002ch, 0000002ch, 0000002ch, 0000002ch +ptr_L_mldsa_decompose_q88_avx2_44 QWORD L_mldsa_decompose_q88_avx2_44 +_DATA ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_decompose_q88_avx2 PROC + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD 
PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqu ymm12, YMMWORD PTR L_mldsa_decompose_q88_avx2_q_low_88 + vmovdqu ymm13, YMMWORD PTR L_mldsa_decompose_q88_avx2_q_low_88_2 + vmovdqu ymm14, YMMWORD PTR L_mldsa_decompose_q88_avx2_q_2 + vmovdqu ymm15, YMMWORD PTR L_mldsa_decompose_q88_avx2_44 + ; 1/4 vectors + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu 
YMMWORD PTR [rdx], ymm4 + vmovdqu YMMWORD PTR [rdx+32], ymm5 + vmovdqu YMMWORD PTR [rdx+64], ymm6 + vmovdqu YMMWORD PTR [rdx+96], ymm7 + vmovdqu YMMWORD PTR [r8], ymm8 + vmovdqu YMMWORD PTR [r8+32], ymm9 + vmovdqu YMMWORD PTR [r8+64], ymm10 + vmovdqu YMMWORD PTR [r8+96], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+128] + vmovdqu ymm1, YMMWORD PTR [rcx+160] + vmovdqu ymm2, YMMWORD PTR [rcx+192] + vmovdqu ymm3, YMMWORD PTR [rcx+224] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+128], ymm4 + vmovdqu 
YMMWORD PTR [rdx+160], ymm5 + vmovdqu YMMWORD PTR [rdx+192], ymm6 + vmovdqu YMMWORD PTR [rdx+224], ymm7 + vmovdqu YMMWORD PTR [r8+128], ymm8 + vmovdqu YMMWORD PTR [r8+160], ymm9 + vmovdqu YMMWORD PTR [r8+192], ymm10 + vmovdqu YMMWORD PTR [r8+224], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+256], ymm4 + vmovdqu YMMWORD PTR [rdx+288], ymm5 
+ vmovdqu YMMWORD PTR [rdx+320], ymm6 + vmovdqu YMMWORD PTR [rdx+352], ymm7 + vmovdqu YMMWORD PTR [r8+256], ymm8 + vmovdqu YMMWORD PTR [r8+288], ymm9 + vmovdqu YMMWORD PTR [r8+320], ymm10 + vmovdqu YMMWORD PTR [r8+352], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+384] + vmovdqu ymm1, YMMWORD PTR [rcx+416] + vmovdqu ymm2, YMMWORD PTR [rcx+448] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+384], ymm4 + vmovdqu YMMWORD PTR [rdx+416], ymm5 + vmovdqu YMMWORD PTR 
[rdx+448], ymm6 + vmovdqu YMMWORD PTR [rdx+480], ymm7 + vmovdqu YMMWORD PTR [r8+384], ymm8 + vmovdqu YMMWORD PTR [r8+416], ymm9 + vmovdqu YMMWORD PTR [r8+448], ymm10 + vmovdqu YMMWORD PTR [r8+480], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+512] + vmovdqu ymm1, YMMWORD PTR [rcx+544] + vmovdqu ymm2, YMMWORD PTR [rcx+576] + vmovdqu ymm3, YMMWORD PTR [rcx+608] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+512], ymm4 + vmovdqu YMMWORD PTR [rdx+544], ymm5 + vmovdqu YMMWORD PTR [rdx+576], ymm6 + vmovdqu 
YMMWORD PTR [rdx+608], ymm7 + vmovdqu YMMWORD PTR [r8+512], ymm8 + vmovdqu YMMWORD PTR [r8+544], ymm9 + vmovdqu YMMWORD PTR [r8+576], ymm10 + vmovdqu YMMWORD PTR [r8+608], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+640] + vmovdqu ymm1, YMMWORD PTR [rcx+672] + vmovdqu ymm2, YMMWORD PTR [rcx+704] + vmovdqu ymm3, YMMWORD PTR [rcx+736] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+640], ymm4 + vmovdqu YMMWORD PTR [rdx+672], ymm5 + vmovdqu YMMWORD PTR [rdx+704], ymm6 + vmovdqu YMMWORD PTR [rdx+736], ymm7 
+ vmovdqu YMMWORD PTR [r8+640], ymm8 + vmovdqu YMMWORD PTR [r8+672], ymm9 + vmovdqu YMMWORD PTR [r8+704], ymm10 + vmovdqu YMMWORD PTR [r8+736], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+768] + vmovdqu ymm1, YMMWORD PTR [rcx+800] + vmovdqu ymm2, YMMWORD PTR [rcx+832] + vmovdqu ymm3, YMMWORD PTR [rcx+864] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+768], ymm4 + vmovdqu YMMWORD PTR [rdx+800], ymm5 + vmovdqu YMMWORD PTR [rdx+832], ymm6 + vmovdqu YMMWORD PTR [rdx+864], ymm7 + vmovdqu YMMWORD PTR 
[r8+768], ymm8 + vmovdqu YMMWORD PTR [r8+800], ymm9 + vmovdqu YMMWORD PTR [r8+832], ymm10 + vmovdqu YMMWORD PTR [r8+864], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+896] + vmovdqu ymm1, YMMWORD PTR [rcx+928] + vmovdqu ymm2, YMMWORD PTR [rcx+960] + vmovdqu ymm3, YMMWORD PTR [rcx+992] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+896], ymm4 + vmovdqu YMMWORD PTR [rdx+928], ymm5 + vmovdqu YMMWORD PTR [rdx+960], ymm6 + vmovdqu YMMWORD PTR [rdx+992], ymm7 + vmovdqu YMMWORD PTR [r8+896], ymm8 + vmovdqu 
YMMWORD PTR [r8+928], ymm9 + vmovdqu YMMWORD PTR [r8+960], ymm10 + vmovdqu YMMWORD PTR [r8+992], ymm11 + ; 2/4 vectors + vmovdqu ymm0, YMMWORD PTR [rcx+1024] + vmovdqu ymm1, YMMWORD PTR [rcx+1056] + vmovdqu ymm2, YMMWORD PTR [rcx+1088] + vmovdqu ymm3, YMMWORD PTR [rcx+1120] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+1024], ymm4 + vmovdqu YMMWORD PTR [rdx+1056], ymm5 + vmovdqu YMMWORD PTR [rdx+1088], ymm6 + vmovdqu YMMWORD PTR [rdx+1120], ymm7 + vmovdqu YMMWORD PTR [r8+1024], ymm8 + vmovdqu 
YMMWORD PTR [r8+1056], ymm9 + vmovdqu YMMWORD PTR [r8+1088], ymm10 + vmovdqu YMMWORD PTR [r8+1120], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+1152] + vmovdqu ymm1, YMMWORD PTR [rcx+1184] + vmovdqu ymm2, YMMWORD PTR [rcx+1216] + vmovdqu ymm3, YMMWORD PTR [rcx+1248] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+1152], ymm4 + vmovdqu YMMWORD PTR [rdx+1184], ymm5 + vmovdqu YMMWORD PTR [rdx+1216], ymm6 + vmovdqu YMMWORD PTR [rdx+1248], ymm7 + vmovdqu YMMWORD PTR [r8+1152], ymm8 + vmovdqu YMMWORD PTR 
[r8+1184], ymm9 + vmovdqu YMMWORD PTR [r8+1216], ymm10 + vmovdqu YMMWORD PTR [r8+1248], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+1280] + vmovdqu ymm1, YMMWORD PTR [rcx+1312] + vmovdqu ymm2, YMMWORD PTR [rcx+1344] + vmovdqu ymm3, YMMWORD PTR [rcx+1376] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+1280], ymm4 + vmovdqu YMMWORD PTR [rdx+1312], ymm5 + vmovdqu YMMWORD PTR [rdx+1344], ymm6 + vmovdqu YMMWORD PTR [rdx+1376], ymm7 + vmovdqu YMMWORD PTR [r8+1280], ymm8 + vmovdqu YMMWORD PTR [r8+1312], ymm9 
+ vmovdqu YMMWORD PTR [r8+1344], ymm10 + vmovdqu YMMWORD PTR [r8+1376], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+1408] + vmovdqu ymm1, YMMWORD PTR [rcx+1440] + vmovdqu ymm2, YMMWORD PTR [rcx+1472] + vmovdqu ymm3, YMMWORD PTR [rcx+1504] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+1408], ymm4 + vmovdqu YMMWORD PTR [rdx+1440], ymm5 + vmovdqu YMMWORD PTR [rdx+1472], ymm6 + vmovdqu YMMWORD PTR [rdx+1504], ymm7 + vmovdqu YMMWORD PTR [r8+1408], ymm8 + vmovdqu YMMWORD PTR [r8+1440], ymm9 + vmovdqu 
YMMWORD PTR [r8+1472], ymm10 + vmovdqu YMMWORD PTR [r8+1504], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+1536] + vmovdqu ymm1, YMMWORD PTR [rcx+1568] + vmovdqu ymm2, YMMWORD PTR [rcx+1600] + vmovdqu ymm3, YMMWORD PTR [rcx+1632] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+1536], ymm4 + vmovdqu YMMWORD PTR [rdx+1568], ymm5 + vmovdqu YMMWORD PTR [rdx+1600], ymm6 + vmovdqu YMMWORD PTR [rdx+1632], ymm7 + vmovdqu YMMWORD PTR [r8+1536], ymm8 + vmovdqu YMMWORD PTR [r8+1568], ymm9 + vmovdqu YMMWORD PTR 
[r8+1600], ymm10 + vmovdqu YMMWORD PTR [r8+1632], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+1664] + vmovdqu ymm1, YMMWORD PTR [rcx+1696] + vmovdqu ymm2, YMMWORD PTR [rcx+1728] + vmovdqu ymm3, YMMWORD PTR [rcx+1760] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+1664], ymm4 + vmovdqu YMMWORD PTR [rdx+1696], ymm5 + vmovdqu YMMWORD PTR [rdx+1728], ymm6 + vmovdqu YMMWORD PTR [rdx+1760], ymm7 + vmovdqu YMMWORD PTR [r8+1664], ymm8 + vmovdqu YMMWORD PTR [r8+1696], ymm9 + vmovdqu YMMWORD PTR [r8+1728], 
ymm10 + vmovdqu YMMWORD PTR [r8+1760], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+1792] + vmovdqu ymm1, YMMWORD PTR [rcx+1824] + vmovdqu ymm2, YMMWORD PTR [rcx+1856] + vmovdqu ymm3, YMMWORD PTR [rcx+1888] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+1792], ymm4 + vmovdqu YMMWORD PTR [rdx+1824], ymm5 + vmovdqu YMMWORD PTR [rdx+1856], ymm6 + vmovdqu YMMWORD PTR [rdx+1888], ymm7 + vmovdqu YMMWORD PTR [r8+1792], ymm8 + vmovdqu YMMWORD PTR [r8+1824], ymm9 + vmovdqu YMMWORD PTR [r8+1856], ymm10 + vmovdqu 
YMMWORD PTR [r8+1888], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+1920] + vmovdqu ymm1, YMMWORD PTR [rcx+1952] + vmovdqu ymm2, YMMWORD PTR [rcx+1984] + vmovdqu ymm3, YMMWORD PTR [rcx+2016] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+1920], ymm4 + vmovdqu YMMWORD PTR [rdx+1952], ymm5 + vmovdqu YMMWORD PTR [rdx+1984], ymm6 + vmovdqu YMMWORD PTR [rdx+2016], ymm7 + vmovdqu YMMWORD PTR [r8+1920], ymm8 + vmovdqu YMMWORD PTR [r8+1952], ymm9 + vmovdqu YMMWORD PTR [r8+1984], ymm10 + vmovdqu YMMWORD PTR 
[r8+2016], ymm11 + ; 3/4 vectors + vmovdqu ymm0, YMMWORD PTR [rcx+2048] + vmovdqu ymm1, YMMWORD PTR [rcx+2080] + vmovdqu ymm2, YMMWORD PTR [rcx+2112] + vmovdqu ymm3, YMMWORD PTR [rcx+2144] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+2048], ymm4 + vmovdqu YMMWORD PTR [rdx+2080], ymm5 + vmovdqu YMMWORD PTR [rdx+2112], ymm6 + vmovdqu YMMWORD PTR [rdx+2144], ymm7 + vmovdqu YMMWORD PTR [r8+2048], ymm8 + vmovdqu YMMWORD PTR [r8+2080], ymm9 + vmovdqu YMMWORD PTR [r8+2112], ymm10 + vmovdqu YMMWORD PTR 
[r8+2144], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+2176] + vmovdqu ymm1, YMMWORD PTR [rcx+2208] + vmovdqu ymm2, YMMWORD PTR [rcx+2240] + vmovdqu ymm3, YMMWORD PTR [rcx+2272] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+2176], ymm4 + vmovdqu YMMWORD PTR [rdx+2208], ymm5 + vmovdqu YMMWORD PTR [rdx+2240], ymm6 + vmovdqu YMMWORD PTR [rdx+2272], ymm7 + vmovdqu YMMWORD PTR [r8+2176], ymm8 + vmovdqu YMMWORD PTR [r8+2208], ymm9 + vmovdqu YMMWORD PTR [r8+2240], ymm10 + vmovdqu YMMWORD PTR [r8+2272], 
ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+2304] + vmovdqu ymm1, YMMWORD PTR [rcx+2336] + vmovdqu ymm2, YMMWORD PTR [rcx+2368] + vmovdqu ymm3, YMMWORD PTR [rcx+2400] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+2304], ymm4 + vmovdqu YMMWORD PTR [rdx+2336], ymm5 + vmovdqu YMMWORD PTR [rdx+2368], ymm6 + vmovdqu YMMWORD PTR [rdx+2400], ymm7 + vmovdqu YMMWORD PTR [r8+2304], ymm8 + vmovdqu YMMWORD PTR [r8+2336], ymm9 + vmovdqu YMMWORD PTR [r8+2368], ymm10 + vmovdqu YMMWORD PTR [r8+2400], ymm11 + vmovdqu 
ymm0, YMMWORD PTR [rcx+2432] + vmovdqu ymm1, YMMWORD PTR [rcx+2464] + vmovdqu ymm2, YMMWORD PTR [rcx+2496] + vmovdqu ymm3, YMMWORD PTR [rcx+2528] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+2432], ymm4 + vmovdqu YMMWORD PTR [rdx+2464], ymm5 + vmovdqu YMMWORD PTR [rdx+2496], ymm6 + vmovdqu YMMWORD PTR [rdx+2528], ymm7 + vmovdqu YMMWORD PTR [r8+2432], ymm8 + vmovdqu YMMWORD PTR [r8+2464], ymm9 + vmovdqu YMMWORD PTR [r8+2496], ymm10 + vmovdqu YMMWORD PTR [r8+2528], ymm11 + vmovdqu ymm0, YMMWORD 
PTR [rcx+2560] + vmovdqu ymm1, YMMWORD PTR [rcx+2592] + vmovdqu ymm2, YMMWORD PTR [rcx+2624] + vmovdqu ymm3, YMMWORD PTR [rcx+2656] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+2560], ymm4 + vmovdqu YMMWORD PTR [rdx+2592], ymm5 + vmovdqu YMMWORD PTR [rdx+2624], ymm6 + vmovdqu YMMWORD PTR [rdx+2656], ymm7 + vmovdqu YMMWORD PTR [r8+2560], ymm8 + vmovdqu YMMWORD PTR [r8+2592], ymm9 + vmovdqu YMMWORD PTR [r8+2624], ymm10 + vmovdqu YMMWORD PTR [r8+2656], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+2688] 
+ vmovdqu ymm1, YMMWORD PTR [rcx+2720] + vmovdqu ymm2, YMMWORD PTR [rcx+2752] + vmovdqu ymm3, YMMWORD PTR [rcx+2784] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+2688], ymm4 + vmovdqu YMMWORD PTR [rdx+2720], ymm5 + vmovdqu YMMWORD PTR [rdx+2752], ymm6 + vmovdqu YMMWORD PTR [rdx+2784], ymm7 + vmovdqu YMMWORD PTR [r8+2688], ymm8 + vmovdqu YMMWORD PTR [r8+2720], ymm9 + vmovdqu YMMWORD PTR [r8+2752], ymm10 + vmovdqu YMMWORD PTR [r8+2784], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+2816] + vmovdqu ymm1, 
YMMWORD PTR [rcx+2848] + vmovdqu ymm2, YMMWORD PTR [rcx+2880] + vmovdqu ymm3, YMMWORD PTR [rcx+2912] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+2816], ymm4 + vmovdqu YMMWORD PTR [rdx+2848], ymm5 + vmovdqu YMMWORD PTR [rdx+2880], ymm6 + vmovdqu YMMWORD PTR [rdx+2912], ymm7 + vmovdqu YMMWORD PTR [r8+2816], ymm8 + vmovdqu YMMWORD PTR [r8+2848], ymm9 + vmovdqu YMMWORD PTR [r8+2880], ymm10 + vmovdqu YMMWORD PTR [r8+2912], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+2944] + vmovdqu ymm1, YMMWORD PTR 
[rcx+2976] + vmovdqu ymm2, YMMWORD PTR [rcx+3008] + vmovdqu ymm3, YMMWORD PTR [rcx+3040] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+2944], ymm4 + vmovdqu YMMWORD PTR [rdx+2976], ymm5 + vmovdqu YMMWORD PTR [rdx+3008], ymm6 + vmovdqu YMMWORD PTR [rdx+3040], ymm7 + vmovdqu YMMWORD PTR [r8+2944], ymm8 + vmovdqu YMMWORD PTR [r8+2976], ymm9 + vmovdqu YMMWORD PTR [r8+3008], ymm10 + vmovdqu YMMWORD PTR [r8+3040], ymm11 + ; 4/4 vectors + vmovdqu ymm0, YMMWORD PTR [rcx+3072] + vmovdqu ymm1, YMMWORD PTR 
[rcx+3104] + vmovdqu ymm2, YMMWORD PTR [rcx+3136] + vmovdqu ymm3, YMMWORD PTR [rcx+3168] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+3072], ymm4 + vmovdqu YMMWORD PTR [rdx+3104], ymm5 + vmovdqu YMMWORD PTR [rdx+3136], ymm6 + vmovdqu YMMWORD PTR [rdx+3168], ymm7 + vmovdqu YMMWORD PTR [r8+3072], ymm8 + vmovdqu YMMWORD PTR [r8+3104], ymm9 + vmovdqu YMMWORD PTR [r8+3136], ymm10 + vmovdqu YMMWORD PTR [r8+3168], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+3200] + vmovdqu ymm1, YMMWORD PTR [rcx+3232] + 
vmovdqu ymm2, YMMWORD PTR [rcx+3264] + vmovdqu ymm3, YMMWORD PTR [rcx+3296] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+3200], ymm4 + vmovdqu YMMWORD PTR [rdx+3232], ymm5 + vmovdqu YMMWORD PTR [rdx+3264], ymm6 + vmovdqu YMMWORD PTR [rdx+3296], ymm7 + vmovdqu YMMWORD PTR [r8+3200], ymm8 + vmovdqu YMMWORD PTR [r8+3232], ymm9 + vmovdqu YMMWORD PTR [r8+3264], ymm10 + vmovdqu YMMWORD PTR [r8+3296], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+3328] + vmovdqu ymm1, YMMWORD PTR [rcx+3360] + vmovdqu ymm2, 
YMMWORD PTR [rcx+3392] + vmovdqu ymm3, YMMWORD PTR [rcx+3424] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+3328], ymm4 + vmovdqu YMMWORD PTR [rdx+3360], ymm5 + vmovdqu YMMWORD PTR [rdx+3392], ymm6 + vmovdqu YMMWORD PTR [rdx+3424], ymm7 + vmovdqu YMMWORD PTR [r8+3328], ymm8 + vmovdqu YMMWORD PTR [r8+3360], ymm9 + vmovdqu YMMWORD PTR [r8+3392], ymm10 + vmovdqu YMMWORD PTR [r8+3424], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+3456] + vmovdqu ymm1, YMMWORD PTR [rcx+3488] + vmovdqu ymm2, YMMWORD PTR 
[rcx+3520] + vmovdqu ymm3, YMMWORD PTR [rcx+3552] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+3456], ymm4 + vmovdqu YMMWORD PTR [rdx+3488], ymm5 + vmovdqu YMMWORD PTR [rdx+3520], ymm6 + vmovdqu YMMWORD PTR [rdx+3552], ymm7 + vmovdqu YMMWORD PTR [r8+3456], ymm8 + vmovdqu YMMWORD PTR [r8+3488], ymm9 + vmovdqu YMMWORD PTR [r8+3520], ymm10 + vmovdqu YMMWORD PTR [r8+3552], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+3584] + vmovdqu ymm1, YMMWORD PTR [rcx+3616] + vmovdqu ymm2, YMMWORD PTR [rcx+3648] + 
vmovdqu ymm3, YMMWORD PTR [rcx+3680] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+3584], ymm4 + vmovdqu YMMWORD PTR [rdx+3616], ymm5 + vmovdqu YMMWORD PTR [rdx+3648], ymm6 + vmovdqu YMMWORD PTR [rdx+3680], ymm7 + vmovdqu YMMWORD PTR [r8+3584], ymm8 + vmovdqu YMMWORD PTR [r8+3616], ymm9 + vmovdqu YMMWORD PTR [r8+3648], ymm10 + vmovdqu YMMWORD PTR [r8+3680], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+3712] + vmovdqu ymm1, YMMWORD PTR [rcx+3744] + vmovdqu ymm2, YMMWORD PTR [rcx+3776] + vmovdqu ymm3, 
YMMWORD PTR [rcx+3808] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+3712], ymm4 + vmovdqu YMMWORD PTR [rdx+3744], ymm5 + vmovdqu YMMWORD PTR [rdx+3776], ymm6 + vmovdqu YMMWORD PTR [rdx+3808], ymm7 + vmovdqu YMMWORD PTR [r8+3712], ymm8 + vmovdqu YMMWORD PTR [r8+3744], ymm9 + vmovdqu YMMWORD PTR [r8+3776], ymm10 + vmovdqu YMMWORD PTR [r8+3808], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+3840] + vmovdqu ymm1, YMMWORD PTR [rcx+3872] + vmovdqu ymm2, YMMWORD PTR [rcx+3904] + vmovdqu ymm3, YMMWORD PTR 
[rcx+3936] + vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+3840], ymm4 + vmovdqu YMMWORD PTR [rdx+3872], ymm5 + vmovdqu YMMWORD PTR [rdx+3904], ymm6 + vmovdqu YMMWORD PTR [rdx+3936], ymm7 + vmovdqu YMMWORD PTR [r8+3840], ymm8 + vmovdqu YMMWORD PTR [r8+3872], ymm9 + vmovdqu YMMWORD PTR [r8+3904], ymm10 + vmovdqu YMMWORD PTR [r8+3936], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+3968] + vmovdqu ymm1, YMMWORD PTR [rcx+4000] + vmovdqu ymm2, YMMWORD PTR [rcx+4032] + vmovdqu ymm3, YMMWORD PTR [rcx+4064] + 
vpmulld ymm8, ymm0, ymm15 + vpmulld ymm9, ymm1, ymm15 + vpmulld ymm10, ymm2, ymm15 + vpmulld ymm11, ymm3, ymm15 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm9, ymm9, ymm14 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm14 + vpsrld ymm8, ymm8, 23 + vpsrld ymm9, ymm9, 23 + vpsrld ymm10, ymm10, 23 + vpsrld ymm11, ymm11, 23 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpsubd ymm4, ymm12, ymm4 + vpsubd ymm5, ymm12, ymm5 + vpsubd ymm6, ymm12, ymm6 + vpsubd ymm7, ymm12, ymm7 + vpsrld ymm4, ymm4, 31 + vpsrld ymm5, ymm5, 31 + vpsrld ymm6, ymm6, 31 + vpsrld ymm7, ymm7, 31 + vpaddd ymm8, ymm8, ymm4 + vpaddd ymm9, ymm9, ymm5 + vpaddd ymm10, ymm10, ymm6 + vpaddd ymm11, ymm11, ymm7 + vpmulld ymm4, ymm8, ymm13 + vpmulld ymm5, ymm9, ymm13 + vpmulld ymm6, ymm10, ymm13 + vpmulld ymm7, ymm11, ymm13 + vpsubd ymm4, ymm0, ymm4 + vpsubd ymm5, ymm1, ymm5 + vpsubd ymm6, ymm2, ymm6 + vpsubd ymm7, ymm3, ymm7 + vpcmpeqd ymm0, ymm8, ymm15 + vpcmpeqd ymm1, ymm9, ymm15 + vpcmpeqd ymm2, ymm10, ymm15 + vpcmpeqd ymm3, ymm11, ymm15 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vpaddd ymm6, ymm6, ymm2 + vpaddd ymm7, ymm7, ymm3 + vpcmpgtd ymm0, ymm15, ymm8 + vpcmpgtd ymm1, ymm15, ymm9 + vpcmpgtd ymm2, ymm15, ymm10 + vpcmpgtd ymm3, ymm15, ymm11 + vpand ymm8, ymm8, ymm0 + vpand ymm9, ymm9, ymm1 + vpand ymm10, ymm10, ymm2 + vpand ymm11, ymm11, ymm3 + vmovdqu YMMWORD PTR [rdx+3968], ymm4 + vmovdqu YMMWORD PTR [rdx+4000], ymm5 + vmovdqu YMMWORD PTR [rdx+4032], ymm6 + vmovdqu YMMWORD PTR [rdx+4064], ymm7 + vmovdqu YMMWORD PTR [r8+3968], ymm8 + vmovdqu YMMWORD PTR [r8+4000], ymm9 + vmovdqu YMMWORD PTR [r8+4032], ymm10 + vmovdqu YMMWORD PTR [r8+4064], ymm11 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD 
PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        vmovdqu xmm13, OWORD PTR [rsp+112]
+        vmovdqu xmm14, OWORD PTR [rsp+128]
+        vmovdqu xmm15, OWORD PTR [rsp+144]
+        add rsp, 160
+        ret
+wc_mldsa_decompose_q88_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT    ; ---- constant tables read by wc_mldsa_decompose_q32_avx2 (each is 8 identical dwords = one YMM register) ----
+ALIGN 16    ; only 16-byte alignment is requested; the 32-byte loads below use vmovdqu, which does not require alignment
+L_mldsa_decompose_q32_avx2_q_low_32 DWORD 0003ff00h, 0003ff00h, 0003ff00h, 0003ff00h    ; 0003ff00h = 261888, loaded into ymm12 (half of the divisor below)
+        DWORD 0003ff00h, 0003ff00h, 0003ff00h, 0003ff00h
+ptr_L_mldsa_decompose_q32_avx2_q_low_32 QWORD L_mldsa_decompose_q32_avx2_q_low_32    ; address of the table above (not referenced by this procedure)
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decompose_q32_avx2_q_low_32_2 DWORD 0007fe00h, 0007fe00h, 0007fe00h, 0007fe00h    ; 0007fe00h = 523776 = 2 * 0003ff00h, loaded into ymm13 (the divisor)
+        DWORD 0007fe00h, 0007fe00h, 0007fe00h, 0007fe00h
+ptr_L_mldsa_decompose_q32_avx2_q_low_32_2 QWORD L_mldsa_decompose_q32_avx2_q_low_32_2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decompose_q32_avx2_q_low_32_m1 DWORD 0003feffh, 0003feffh, 0003feffh, 0003feffh    ; 0003feffh = 0003ff00h - 1, loaded into ymm14 (bias added before the shift)
+        DWORD 0003feffh, 0003feffh, 0003feffh, 0003feffh
+ptr_L_mldsa_decompose_q32_avx2_q_low_32_m1 QWORD L_mldsa_decompose_q32_avx2_q_low_32_m1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_decompose_q32_avx2_mask DWORD 0000000fh, 0000000fh, 0000000fh, 0000000fh    ; 0fh, loaded into ymm15 (keeps the low 4 bits of the quotient)
+        DWORD 0000000fh, 0000000fh, 0000000fh, 0000000fh
+ptr_L_mldsa_decompose_q32_avx2_mask QWORD L_mldsa_decompose_q32_avx2_mask
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_decompose_q32_avx2 PROC    ; For every dword x read via rcx: t = x / 0007fe00h rounded to nearest; writes x - t*0007fe00h - (t shr 4) via r8 and (t and 0fh) via r9. rdx = number of 256-dword chunks.
+        sub rsp, 160    ; reserve 10 * 16 bytes to spill xmm6-xmm15. NOTE(review): PROC has no FRAME/.allocstack/.savexmm128 unwind directives although rsp is moved - confirm this is acceptable
+        vmovdqu OWORD PTR [rsp], xmm6    ; xmm6-xmm15 are overwritten below (as ymm6-ymm15), so their low 128 bits are saved first
+        vmovdqu OWORD PTR [rsp+16], xmm7    ; (these registers are nonvolatile in the Microsoft x64 calling convention)
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu OWORD PTR [rsp+112], xmm13
+        vmovdqu OWORD PTR [rsp+128], xmm14
+        vmovdqu OWORD PTR [rsp+144], xmm15
+        vmovdqu ymm12, YMMWORD PTR L_mldsa_decompose_q32_avx2_q_low_32    ; ymm12 = 0003ff00h in every lane
+        vmovdqu ymm13, YMMWORD PTR L_mldsa_decompose_q32_avx2_q_low_32_2    ; ymm13 = 0007fe00h in every lane
+        vmovdqu ymm14, YMMWORD PTR L_mldsa_decompose_q32_avx2_q_low_32_m1    ; ymm14 = 0003feffh in every lane
+        vmovdqu ymm15, YMMWORD PTR L_mldsa_decompose_q32_avx2_mask    ; ymm15 = 0fh in every lane
+L_mldsa_decompose_q32_avx2_start_256:    ; one pass = 256 dwords (1024 bytes), unrolled as 8 groups of 4 YMM vectors
+        vmovdqu ymm0, YMMWORD PTR [rcx]    ; group 1 of 8: dwords 0-31; ymm0-ymm3 = x (source read through rcx)
+        vmovdqu ymm1, YMMWORD PTR [rcx+32]
+        vmovdqu ymm2, YMMWORD PTR [rcx+64]
+        vmovdqu ymm3, YMMWORD PTR [rcx+96]
+        vpaddd ymm8, ymm0, ymm14    ; ymm8-ymm11 = x + 0003feffh
+        vpaddd ymm9, ymm1, ymm14
+        vpaddd ymm10, ymm2, ymm14
+        vpaddd ymm11, ymm3, ymm14
+        vpsrld ymm8, ymm8, 19    ; t = (x + 0003feffh) shr 19 (logical): first estimate of the quotient
+        vpsrld ymm9, ymm9, 19
+        vpsrld ymm10, ymm10, 19
+        vpsrld ymm11, ymm11, 19
+        vpmulld ymm4, ymm8, ymm13    ; ymm4-ymm7 = t * 0007fe00h (low 32 bits of each product)
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4    ; rem = x - t * 0007fe00h
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsubd ymm4, ymm12, ymm4    ; 0003ff00h - rem: sign bit becomes set when rem is greater than 0003ff00h
+        vpsubd ymm5, ymm12, ymm5
+        vpsubd ymm6, ymm12, ymm6
+        vpsubd ymm7, ymm12, ymm7
+        vpsrld ymm4, ymm4, 31    ; move that sign bit to bit 0: correction of 0 or 1
+        vpsrld ymm5, ymm5, 31
+        vpsrld ymm6, ymm6, 31
+        vpsrld ymm7, ymm7, 31
+        vpaddd ymm8, ymm8, ymm4    ; t = t + correction
+        vpaddd ymm9, ymm9, ymm5
+        vpaddd ymm10, ymm10, ymm6
+        vpaddd ymm11, ymm11, ymm7
+        vpmulld ymm4, ymm8, ymm13    ; recompute the remainder with the corrected t ...
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4    ; ... rem = x - t * 0007fe00h
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsrld ymm0, ymm8, 4    ; ymm0-ymm3 (x no longer needed) = t shr 4, nonzero only once t has reached 16
+        vpsrld ymm1, ymm9, 4
+        vpsrld ymm2, ymm10, 4
+        vpsrld ymm3, ymm11, 4
+        vpsubd ymm4, ymm4, ymm0    ; rem = rem - (t shr 4)
+        vpsubd ymm5, ymm5, ymm1
+        vpsubd ymm6, ymm6, ymm2
+        vpsubd ymm7, ymm7, ymm3
+        vpand ymm8, ymm8, ymm15    ; t = t and 0fh (a t of 16 wraps to 0)
+        vpand ymm9, ymm9, ymm15
+        vpand ymm10, ymm10, ymm15
+        vpand ymm11, ymm11, ymm15
+        vmovdqu YMMWORD PTR [r8], ymm4    ; remainders are stored through r8
+        vmovdqu YMMWORD PTR [r8+32], ymm5
+        vmovdqu YMMWORD PTR [r8+64], ymm6
+        vmovdqu YMMWORD PTR [r8+96], ymm7
+        vmovdqu YMMWORD PTR [r9], ymm8    ; 4-bit quotients are stored through r9
+        vmovdqu YMMWORD PTR [r9+32], ymm9
+        vmovdqu YMMWORD PTR [r9+64], ymm10
+        vmovdqu YMMWORD PTR [r9+96], ymm11
+        vmovdqu ymm0, YMMWORD PTR [rcx+128]    ; group 2 of 8: dwords 32-63, same steps as group 1
+        vmovdqu ymm1, YMMWORD PTR [rcx+160]
+        vmovdqu ymm2, YMMWORD PTR [rcx+192]
+        vmovdqu ymm3, YMMWORD PTR [rcx+224]
+        vpaddd ymm8, ymm0, ymm14
+        vpaddd ymm9, ymm1, ymm14
+        vpaddd ymm10, ymm2, ymm14
+        vpaddd ymm11, ymm3, ymm14
+        vpsrld ymm8, ymm8, 19
+        vpsrld ymm9, ymm9, 19
+        vpsrld ymm10, ymm10, 19
+        vpsrld ymm11, ymm11, 19
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsubd ymm4, ymm12, ymm4
+        vpsubd ymm5, ymm12, ymm5
+        vpsubd ymm6, ymm12, ymm6
+        vpsubd ymm7, ymm12, ymm7
+        vpsrld ymm4, ymm4, 31
+        vpsrld ymm5, ymm5, 31
+        vpsrld ymm6, ymm6, 31
+        vpsrld ymm7, ymm7, 31
+        vpaddd ymm8, ymm8, ymm4
+        vpaddd ymm9, ymm9, ymm5
+        vpaddd ymm10, ymm10, ymm6
+        vpaddd ymm11, ymm11, ymm7
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsrld ymm0, ymm8, 4
+        vpsrld ymm1, ymm9, 4
+        vpsrld ymm2, ymm10, 4
+        vpsrld ymm3, ymm11, 4
+        vpsubd ymm4, ymm4, ymm0
+        vpsubd ymm5, ymm5, ymm1
+        vpsubd ymm6, ymm6, ymm2
+        vpsubd ymm7, ymm7, ymm3
+        vpand ymm8, ymm8, ymm15
+        vpand ymm9, ymm9, ymm15
+        vpand ymm10, ymm10, ymm15
+        vpand ymm11, ymm11, ymm15
+        vmovdqu YMMWORD PTR [r8+128], ymm4
+        vmovdqu YMMWORD PTR [r8+160], ymm5
+        vmovdqu YMMWORD PTR [r8+192], ymm6
+        vmovdqu YMMWORD PTR [r8+224], ymm7
+        vmovdqu YMMWORD PTR [r9+128], ymm8
+        vmovdqu YMMWORD PTR [r9+160], ymm9
+        vmovdqu YMMWORD PTR [r9+192], ymm10
+        vmovdqu YMMWORD PTR [r9+224], ymm11
+        vmovdqu ymm0, YMMWORD PTR [rcx+256]    ; group 3 of 8: dwords 64-95, same steps as group 1
+        vmovdqu ymm1, YMMWORD PTR [rcx+288]
+        vmovdqu ymm2, YMMWORD PTR [rcx+320]
+        vmovdqu ymm3, YMMWORD PTR [rcx+352]
+        vpaddd ymm8, ymm0, ymm14
+        vpaddd ymm9, ymm1, ymm14
+        vpaddd ymm10, ymm2, ymm14
+        vpaddd ymm11, ymm3, ymm14
+        vpsrld ymm8, ymm8, 19
+        vpsrld ymm9, ymm9, 19
+        vpsrld ymm10, ymm10, 19
+        vpsrld ymm11, ymm11, 19
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsubd ymm4, ymm12, ymm4
+        vpsubd ymm5, ymm12, ymm5
+        vpsubd ymm6, ymm12, ymm6
+        vpsubd ymm7, ymm12, ymm7
+        vpsrld ymm4, ymm4, 31
+        vpsrld ymm5, ymm5, 31
+        vpsrld ymm6, ymm6, 31
+        vpsrld ymm7, ymm7, 31
+        vpaddd ymm8, ymm8, ymm4
+        vpaddd ymm9, ymm9, ymm5
+        vpaddd ymm10, ymm10, ymm6
+        vpaddd ymm11, ymm11, ymm7
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsrld ymm0, ymm8, 4
+        vpsrld ymm1, ymm9, 4
+        vpsrld ymm2, ymm10, 4
+        vpsrld ymm3, ymm11, 4
+        vpsubd ymm4, ymm4, ymm0
+        vpsubd ymm5, ymm5, ymm1
+        vpsubd ymm6, ymm6, ymm2
+        vpsubd ymm7, ymm7, ymm3
+        vpand ymm8, ymm8, ymm15
+        vpand ymm9, ymm9, ymm15
+        vpand ymm10, ymm10, ymm15
+        vpand ymm11, ymm11, ymm15
+        vmovdqu YMMWORD PTR [r8+256], ymm4
+        vmovdqu YMMWORD PTR [r8+288], ymm5
+        vmovdqu YMMWORD PTR [r8+320], ymm6
+        vmovdqu YMMWORD PTR [r8+352], ymm7
+        vmovdqu YMMWORD PTR [r9+256], ymm8
+        vmovdqu YMMWORD PTR [r9+288], ymm9
+        vmovdqu YMMWORD PTR [r9+320], ymm10
+        vmovdqu YMMWORD PTR [r9+352], ymm11
+        vmovdqu ymm0, YMMWORD PTR [rcx+384]    ; group 4 of 8: dwords 96-127, same steps as group 1
+        vmovdqu ymm1, YMMWORD PTR [rcx+416]
+        vmovdqu ymm2, YMMWORD PTR [rcx+448]
+        vmovdqu ymm3, YMMWORD PTR [rcx+480]
+        vpaddd ymm8, ymm0, ymm14
+        vpaddd ymm9, ymm1, ymm14
+        vpaddd ymm10, ymm2, ymm14
+        vpaddd ymm11, ymm3, ymm14
+        vpsrld ymm8, ymm8, 19
+        vpsrld ymm9, ymm9, 19
+        vpsrld ymm10, ymm10, 19
+        vpsrld ymm11, ymm11, 19
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsubd ymm4, ymm12, ymm4
+        vpsubd ymm5, ymm12, ymm5
+        vpsubd ymm6, ymm12, ymm6
+        vpsubd ymm7, ymm12, ymm7
+        vpsrld ymm4, ymm4, 31
+        vpsrld ymm5, ymm5, 31
+        vpsrld ymm6, ymm6, 31
+        vpsrld ymm7, ymm7, 31
+        vpaddd ymm8, ymm8, ymm4
+        vpaddd ymm9, ymm9, ymm5
+        vpaddd ymm10, ymm10, ymm6
+        vpaddd ymm11, ymm11, ymm7
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsrld ymm0, ymm8, 4
+        vpsrld ymm1, ymm9, 4
+        vpsrld ymm2, ymm10, 4
+        vpsrld ymm3, ymm11, 4
+        vpsubd ymm4, ymm4, ymm0
+        vpsubd ymm5, ymm5, ymm1
+        vpsubd ymm6, ymm6, ymm2
+        vpsubd ymm7, ymm7, ymm3
+        vpand ymm8, ymm8, ymm15
+        vpand ymm9, ymm9, ymm15
+        vpand ymm10, ymm10, ymm15
+        vpand ymm11, ymm11, ymm15
+        vmovdqu YMMWORD PTR [r8+384], ymm4
+        vmovdqu YMMWORD PTR [r8+416], ymm5
+        vmovdqu YMMWORD PTR [r8+448], ymm6
+        vmovdqu YMMWORD PTR [r8+480], ymm7
+        vmovdqu YMMWORD PTR [r9+384], ymm8
+        vmovdqu YMMWORD PTR [r9+416], ymm9
+        vmovdqu YMMWORD PTR [r9+448], ymm10
+        vmovdqu YMMWORD PTR [r9+480], ymm11
+        vmovdqu ymm0, YMMWORD PTR [rcx+512]    ; group 5 of 8: dwords 128-159, same steps as group 1
+        vmovdqu ymm1, YMMWORD PTR [rcx+544]
+        vmovdqu ymm2, YMMWORD PTR [rcx+576]
+        vmovdqu ymm3, YMMWORD PTR [rcx+608]
+        vpaddd ymm8, ymm0, ymm14
+        vpaddd ymm9, ymm1, ymm14
+        vpaddd ymm10, ymm2, ymm14
+        vpaddd ymm11, ymm3, ymm14
+        vpsrld ymm8, ymm8, 19
+        vpsrld ymm9, ymm9, 19
+        vpsrld ymm10, ymm10, 19
+        vpsrld ymm11, ymm11, 19
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsubd ymm4, ymm12, ymm4
+        vpsubd ymm5, ymm12, ymm5
+        vpsubd ymm6, ymm12, ymm6
+        vpsubd ymm7, ymm12, ymm7
+        vpsrld ymm4, ymm4, 31
+        vpsrld ymm5, ymm5, 31
+        vpsrld ymm6, ymm6, 31
+        vpsrld ymm7, ymm7, 31
+        vpaddd ymm8, ymm8, ymm4
+        vpaddd ymm9, ymm9, ymm5
+        vpaddd ymm10, ymm10, ymm6
+        vpaddd ymm11, ymm11, ymm7
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsrld ymm0, ymm8, 4
+        vpsrld ymm1, ymm9, 4
+        vpsrld ymm2, ymm10, 4
+        vpsrld ymm3, ymm11, 4
+        vpsubd ymm4, ymm4, ymm0
+        vpsubd ymm5, ymm5, ymm1
+        vpsubd ymm6, ymm6, ymm2
+        vpsubd ymm7, ymm7, ymm3
+        vpand ymm8, ymm8, ymm15
+        vpand ymm9, ymm9, ymm15
+        vpand ymm10, ymm10, ymm15
+        vpand ymm11, ymm11, ymm15
+        vmovdqu YMMWORD PTR [r8+512], ymm4
+        vmovdqu YMMWORD PTR [r8+544], ymm5
+        vmovdqu YMMWORD PTR [r8+576], ymm6
+        vmovdqu YMMWORD PTR [r8+608], ymm7
+        vmovdqu YMMWORD PTR [r9+512], ymm8
+        vmovdqu YMMWORD PTR [r9+544], ymm9
+        vmovdqu YMMWORD PTR [r9+576], ymm10
+        vmovdqu YMMWORD PTR [r9+608], ymm11
+        vmovdqu ymm0, YMMWORD PTR [rcx+640]    ; group 6 of 8: dwords 160-191, same steps as group 1
+        vmovdqu ymm1, YMMWORD PTR [rcx+672]
+        vmovdqu ymm2, YMMWORD PTR [rcx+704]
+        vmovdqu ymm3, YMMWORD PTR [rcx+736]
+        vpaddd ymm8, ymm0, ymm14
+        vpaddd ymm9, ymm1, ymm14
+        vpaddd ymm10, ymm2, ymm14
+        vpaddd ymm11, ymm3, ymm14
+        vpsrld ymm8, ymm8, 19
+        vpsrld ymm9, ymm9, 19
+        vpsrld ymm10, ymm10, 19
+        vpsrld ymm11, ymm11, 19
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsubd ymm4, ymm12, ymm4
+        vpsubd ymm5, ymm12, ymm5
+        vpsubd ymm6, ymm12, ymm6
+        vpsubd ymm7, ymm12, ymm7
+        vpsrld ymm4, ymm4, 31
+        vpsrld ymm5, ymm5, 31
+        vpsrld ymm6, ymm6, 31
+        vpsrld ymm7, ymm7, 31
+        vpaddd ymm8, ymm8, ymm4
+        vpaddd ymm9, ymm9, ymm5
+        vpaddd ymm10, ymm10, ymm6
+        vpaddd ymm11, ymm11, ymm7
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsrld ymm0, ymm8, 4
+        vpsrld ymm1, ymm9, 4
+        vpsrld ymm2, ymm10, 4
+        vpsrld ymm3, ymm11, 4
+        vpsubd ymm4, ymm4, ymm0
+        vpsubd ymm5, ymm5, ymm1
+        vpsubd ymm6, ymm6, ymm2
+        vpsubd ymm7, ymm7, ymm3
+        vpand ymm8, ymm8, ymm15
+        vpand ymm9, ymm9, ymm15
+        vpand ymm10, ymm10, ymm15
+        vpand ymm11, ymm11, ymm15
+        vmovdqu YMMWORD PTR [r8+640], ymm4
+        vmovdqu YMMWORD PTR [r8+672], ymm5
+        vmovdqu YMMWORD PTR [r8+704], ymm6
+        vmovdqu YMMWORD PTR [r8+736], ymm7
+        vmovdqu YMMWORD PTR [r9+640], ymm8
+        vmovdqu YMMWORD PTR [r9+672], ymm9
+        vmovdqu YMMWORD PTR [r9+704], ymm10
+        vmovdqu YMMWORD PTR [r9+736], ymm11
+        vmovdqu ymm0, YMMWORD PTR [rcx+768]    ; group 7 of 8: dwords 192-223, same steps as group 1
+        vmovdqu ymm1, YMMWORD PTR [rcx+800]
+        vmovdqu ymm2, YMMWORD PTR [rcx+832]
+        vmovdqu ymm3, YMMWORD PTR [rcx+864]
+        vpaddd ymm8, ymm0, ymm14
+        vpaddd ymm9, ymm1, ymm14
+        vpaddd ymm10, ymm2, ymm14
+        vpaddd ymm11, ymm3, ymm14
+        vpsrld ymm8, ymm8, 19
+        vpsrld ymm9, ymm9, 19
+        vpsrld ymm10, ymm10, 19
+        vpsrld ymm11, ymm11, 19
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsubd ymm4, ymm12, ymm4
+        vpsubd ymm5, ymm12, ymm5
+        vpsubd ymm6, ymm12, ymm6
+        vpsubd ymm7, ymm12, ymm7
+        vpsrld ymm4, ymm4, 31
+        vpsrld ymm5, ymm5, 31
+        vpsrld ymm6, ymm6, 31
+        vpsrld ymm7, ymm7, 31
+        vpaddd ymm8, ymm8, ymm4
+        vpaddd ymm9, ymm9, ymm5
+        vpaddd ymm10, ymm10, ymm6
+        vpaddd ymm11, ymm11, ymm7
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsrld ymm0, ymm8, 4
+        vpsrld ymm1, ymm9, 4
+        vpsrld ymm2, ymm10, 4
+        vpsrld ymm3, ymm11, 4
+        vpsubd ymm4, ymm4, ymm0
+        vpsubd ymm5, ymm5, ymm1
+        vpsubd ymm6, ymm6, ymm2
+        vpsubd ymm7, ymm7, ymm3
+        vpand ymm8, ymm8, ymm15
+        vpand ymm9, ymm9, ymm15
+        vpand ymm10, ymm10, ymm15
+        vpand ymm11, ymm11, ymm15
+        vmovdqu YMMWORD PTR [r8+768], ymm4
+        vmovdqu YMMWORD PTR [r8+800], ymm5
+        vmovdqu YMMWORD PTR [r8+832], ymm6
+        vmovdqu YMMWORD PTR [r8+864], ymm7
+        vmovdqu YMMWORD PTR [r9+768], ymm8
+        vmovdqu YMMWORD PTR [r9+800], ymm9
+        vmovdqu YMMWORD PTR [r9+832], ymm10
+        vmovdqu YMMWORD PTR [r9+864], ymm11
+        vmovdqu ymm0, YMMWORD PTR [rcx+896]    ; group 8 of 8: dwords 224-255, same steps as group 1
+        vmovdqu ymm1, YMMWORD PTR [rcx+928]
+        vmovdqu ymm2, YMMWORD PTR [rcx+960]
+        vmovdqu ymm3, YMMWORD PTR [rcx+992]
+        vpaddd ymm8, ymm0, ymm14
+        vpaddd ymm9, ymm1, ymm14
+        vpaddd ymm10, ymm2, ymm14
+        vpaddd ymm11, ymm3, ymm14
+        vpsrld ymm8, ymm8, 19
+        vpsrld ymm9, ymm9, 19
+        vpsrld ymm10, ymm10, 19
+        vpsrld ymm11, ymm11, 19
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsubd ymm4, ymm12, ymm4
+        vpsubd ymm5, ymm12, ymm5
+        vpsubd ymm6, ymm12, ymm6
+        vpsubd ymm7, ymm12, ymm7
+        vpsrld ymm4, ymm4, 31
+        vpsrld ymm5, ymm5, 31
+        vpsrld ymm6, ymm6, 31
+        vpsrld ymm7, ymm7, 31
+        vpaddd ymm8, ymm8, ymm4
+        vpaddd ymm9, ymm9, ymm5
+        vpaddd ymm10, ymm10, ymm6
+        vpaddd ymm11, ymm11, ymm7
+        vpmulld ymm4, ymm8, ymm13
+        vpmulld ymm5, ymm9, ymm13
+        vpmulld ymm6, ymm10, ymm13
+        vpmulld ymm7, ymm11, ymm13
+        vpsubd ymm4, ymm0, ymm4
+        vpsubd ymm5, ymm1, ymm5
+        vpsubd ymm6, ymm2, ymm6
+        vpsubd ymm7, ymm3, ymm7
+        vpsrld ymm0, ymm8, 4
+        vpsrld ymm1, ymm9, 4
+        vpsrld ymm2, ymm10, 4
+        vpsrld ymm3, ymm11, 4
+        vpsubd ymm4, ymm4, ymm0
+        vpsubd ymm5, ymm5, ymm1
+        vpsubd ymm6, ymm6, ymm2
+        vpsubd ymm7, ymm7, ymm3
+        vpand ymm8, ymm8, ymm15
+        vpand ymm9, ymm9, ymm15
+        vpand ymm10, ymm10, ymm15
+        vpand ymm11, ymm11, ymm15
+        vmovdqu YMMWORD PTR [r8+896], ymm4
+        vmovdqu YMMWORD PTR [r8+928], ymm5
+        vmovdqu YMMWORD PTR [r8+960], ymm6
+        vmovdqu YMMWORD PTR [r8+992], ymm7
+        vmovdqu YMMWORD PTR [r9+896], ymm8
+        vmovdqu YMMWORD PTR [r9+928], ymm9
+        vmovdqu YMMWORD PTR [r9+960], ymm10
+        vmovdqu YMMWORD PTR [r9+992], ymm11
+        add rcx, 1024    ; advance the source and both destinations by one 256-dword chunk
+        add r8, 1024
+        add r9, 1024
+        sub rdx, 1    ; one chunk done; the full 64-bit rdx is the counter. NOTE(review): tested only after the body, so rdx = 0 on entry is not handled - confirm callers always pass at least 1
+        jne L_mldsa_decompose_q32_avx2_start_256    ; repeat while chunks remain
+        vzeroupper    ; zero the upper halves of the YMM registers before the XMM restores and return
+        vmovdqu xmm6, OWORD PTR [rsp]    ; restore the xmm6-xmm15 values saved in the prologue
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        vmovdqu xmm13, OWORD PTR [rsp+112]
+        vmovdqu xmm14, OWORD PTR [rsp+128]
+        vmovdqu xmm15, OWORD PTR [rsp+144]
+        add rsp, 160    ; release the spill area
+        ret
+wc_mldsa_decompose_q32_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_q DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h
+        DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h
+ptr_L_mldsa_use_hint_88_avx2_q QWORD L_mldsa_use_hint_88_avx2_q
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_q_low_88 DWORD 00017400h, 00017400h, 00017400h, 00017400h
+        DWORD 00017400h, 00017400h, 00017400h, 00017400h
+ptr_L_mldsa_use_hint_88_avx2_q_low_88 QWORD L_mldsa_use_hint_88_avx2_q_low_88
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_q_low_88_2 DWORD 0002e800h, 0002e800h, 0002e800h, 0002e800h
+        DWORD 0002e800h, 0002e800h, 0002e800h, 0002e800h
+ptr_L_mldsa_use_hint_88_avx2_q_low_88_2 QWORD L_mldsa_use_hint_88_avx2_q_low_88_2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_q_2 DWORD 003fefd4h, 003fefd4h, 003fefd4h, 003fefd4h
+        DWORD 003fefd4h, 003fefd4h, 003fefd4h, 003fefd4h
+ptr_L_mldsa_use_hint_88_avx2_q_2 QWORD L_mldsa_use_hint_88_avx2_q_2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_44 DWORD 0000002ch, 0000002ch, 0000002ch, 0000002ch
+        DWORD 0000002ch, 0000002ch, 0000002ch, 0000002ch
+ptr_L_mldsa_use_hint_88_avx2_44 QWORD L_mldsa_use_hint_88_avx2_44
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_vsl DWORD 0000001fh, 0000001eh, 0000001dh, 0000001ch
+        DWORD 0000001bh, 0000001ah, 00000019h, 00000018h
+ptr_L_mldsa_use_hint_88_avx2_vsl QWORD L_mldsa_use_hint_88_avx2_vsl
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_88_avx2_one DWORD 00000001h, 00000001h, 00000001h, 00000001h
+        DWORD 00000001h, 00000001h, 00000001h, 00000001h
+ptr_L_mldsa_use_hint_88_avx2_one QWORD L_mldsa_use_hint_88_avx2_one
+_DATA ENDS
+_TEXT SEGMENT READONLY
PARA +wc_mldsa_use_hint_88_avx2 PROC + push r12 + push r13 + mov rax, rdx + mov rdx, rcx + sub rsp, 144 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vpxor ymm8, ymm8, ymm8 + vmovdqu ymm8, YMMWORD PTR L_mldsa_use_hint_88_avx2_q + vmovdqu ymm9, YMMWORD PTR L_mldsa_use_hint_88_avx2_q_low_88 + vmovdqu ymm10, YMMWORD PTR L_mldsa_use_hint_88_avx2_q_low_88_2 + vmovdqu ymm11, YMMWORD PTR L_mldsa_use_hint_88_avx2_q_2 + vmovdqu ymm12, YMMWORD PTR L_mldsa_use_hint_88_avx2_44 + vmovdqu ymm13, YMMWORD PTR L_mldsa_use_hint_88_avx2_vsl + vmovdqu ymm14, YMMWORD PTR L_mldsa_use_hint_88_avx2_one + xor r9, r9 + mov r10b, [rax] + ; 1/4 vectors + mov r8b, [rax+80] + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm1, YMMWORD PTR [rdx+32] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_0: + cmp r9b, r8b + jge 
L_mldsa_use_hint_88_avx2_hints_done_0_0 + mov cl, r10b + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_0 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_0 +L_mldsa_use_hint_88_avx2_hints_done_0_0: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx], ymm4 + vmovdqu YMMWORD PTR [rdx+32], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+64] + vmovdqu ymm1, YMMWORD PTR [rdx+96] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, 
ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_1: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_1 + mov cl, r10b + sub cl, 16 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_1 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_1 +L_mldsa_use_hint_88_avx2_hints_done_0_1: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+64], ymm4 + vmovdqu YMMWORD PTR [rdx+96], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+128] + vmovdqu ymm1, YMMWORD PTR [rdx+160] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, 
ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_2: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_2 + mov cl, r10b + sub cl, 32 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_2 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_2 +L_mldsa_use_hint_88_avx2_hints_done_0_2: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+128], ymm4 + vmovdqu YMMWORD PTR [rdx+160], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+192] + vmovdqu ymm1, YMMWORD PTR [rdx+224] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 
+ vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_3: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_3 + mov cl, r10b + sub cl, 48 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_3 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_3 +L_mldsa_use_hint_88_avx2_hints_done_0_3: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+192], ymm4 + vmovdqu YMMWORD PTR [rdx+224], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+256] + vmovdqu ymm1, YMMWORD PTR [rdx+288] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, 
ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_4: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_4 + mov cl, r10b + sub cl, 64 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_4 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_4 +L_mldsa_use_hint_88_avx2_hints_done_0_4: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+256], ymm4 + vmovdqu YMMWORD PTR [rdx+288], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+320] + vmovdqu ymm1, YMMWORD PTR [rdx+352] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, 
ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_5: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_5 + mov cl, r10b + sub cl, 80 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_5 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_5 +L_mldsa_use_hint_88_avx2_hints_done_0_5: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+320], ymm4 + vmovdqu YMMWORD PTR [rdx+352], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+384] + vmovdqu ymm1, YMMWORD PTR [rdx+416] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, 
ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_6: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_6 + mov cl, r10b + sub cl, 96 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_6 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_6 +L_mldsa_use_hint_88_avx2_hints_done_0_6: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+384], ymm4 + vmovdqu YMMWORD PTR 
[rdx+416], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+448] + vmovdqu ymm1, YMMWORD PTR [rdx+480] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_7: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_7 + mov cl, r10b + sub cl, 112 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_7 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_7 +L_mldsa_use_hint_88_avx2_hints_done_0_7: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + 
vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+448], ymm4 + vmovdqu YMMWORD PTR [rdx+480], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+512] + vmovdqu ymm1, YMMWORD PTR [rdx+544] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_8: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_8 + mov cl, r10b + sub cl, 128 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_8 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_8 +L_mldsa_use_hint_88_avx2_hints_done_0_8: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 
+ vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+512], ymm4 + vmovdqu YMMWORD PTR [rdx+544], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+576] + vmovdqu ymm1, YMMWORD PTR [rdx+608] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_9: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_9 + mov cl, r10b + sub cl, 144 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_9 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_9 +L_mldsa_use_hint_88_avx2_hints_done_0_9: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 
1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+576], ymm4 + vmovdqu YMMWORD PTR [rdx+608], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+640] + vmovdqu ymm1, YMMWORD PTR [rdx+672] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_10: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_10 + mov cl, r10b + sub cl, 160 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_10 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_10 +L_mldsa_use_hint_88_avx2_hints_done_0_10: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + 
vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+640], ymm4 + vmovdqu YMMWORD PTR [rdx+672], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+704] + vmovdqu ymm1, YMMWORD PTR [rdx+736] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_11: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_11 + mov cl, r10b + sub cl, 176 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_11 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_11 
+L_mldsa_use_hint_88_avx2_hints_done_0_11: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+704], ymm4 + vmovdqu YMMWORD PTR [rdx+736], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+768] + vmovdqu ymm1, YMMWORD PTR [rdx+800] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_12: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_12 + mov cl, r10b + sub cl, 192 + cmp rcx, 16 + 
jge L_mldsa_use_hint_88_avx2_hints_done_0_12 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_12 +L_mldsa_use_hint_88_avx2_hints_done_0_12: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+768], ymm4 + vmovdqu YMMWORD PTR [rdx+800], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+832] + vmovdqu ymm1, YMMWORD PTR [rdx+864] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + 
xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_13: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_13 + mov cl, r10b + sub cl, 208 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_13 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_13 +L_mldsa_use_hint_88_avx2_hints_done_0_13: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+832], ymm4 + vmovdqu YMMWORD PTR [rdx+864], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+896] + vmovdqu ymm1, YMMWORD PTR [rdx+928] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 
+ vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_14: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_14 + mov cl, r10b + sub cl, 224 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_14 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_14 +L_mldsa_use_hint_88_avx2_hints_done_0_14: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+896], ymm4 + vmovdqu YMMWORD PTR [rdx+928], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+960] + vmovdqu ymm1, YMMWORD PTR [rdx+992] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 
+ vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_0_15: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_0_15 + mov cl, r10b + sub cl, 240 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_0_15 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_0_15 +L_mldsa_use_hint_88_avx2_hints_done_0_15: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+960], ymm4 + vmovdqu YMMWORD PTR [rdx+992], ymm5 + ; 2/4 vectors + mov r8b, [rax+81] + vmovdqu ymm0, YMMWORD PTR [rdx+1024] + vmovdqu ymm1, YMMWORD PTR [rdx+1056] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + 
vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_0: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_0 + mov cl, r10b + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_0 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_0 +L_mldsa_use_hint_88_avx2_hints_done_1_0: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1024], ymm4 + vmovdqu YMMWORD PTR [rdx+1056], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1088] + vmovdqu ymm1, YMMWORD PTR [rdx+1120] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd 
ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_1: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_1 + mov cl, r10b + sub cl, 16 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_1 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_1 +L_mldsa_use_hint_88_avx2_hints_done_1_1: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1088], ymm4 + vmovdqu YMMWORD PTR [rdx+1120], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1152] + vmovdqu ymm1, YMMWORD PTR [rdx+1184] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand 
ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_2: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_2 + mov cl, r10b + sub cl, 32 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_2 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_2 +L_mldsa_use_hint_88_avx2_hints_done_1_2: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1152], ymm4 + vmovdqu YMMWORD PTR [rdx+1184], 
ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1216] + vmovdqu ymm1, YMMWORD PTR [rdx+1248] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_3: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_3 + mov cl, r10b + sub cl, 48 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_3 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_3 +L_mldsa_use_hint_88_avx2_hints_done_1_3: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, 
ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1216], ymm4 + vmovdqu YMMWORD PTR [rdx+1248], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1280] + vmovdqu ymm1, YMMWORD PTR [rdx+1312] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_4: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_4 + mov cl, r10b + sub cl, 64 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_4 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_4 +L_mldsa_use_hint_88_avx2_hints_done_1_4: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + 
vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1280], ymm4 + vmovdqu YMMWORD PTR [rdx+1312], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1344] + vmovdqu ymm1, YMMWORD PTR [rdx+1376] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_5: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_5 + mov cl, r10b + sub cl, 80 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_5 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_5 +L_mldsa_use_hint_88_avx2_hints_done_1_5: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, 
ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1344], ymm4 + vmovdqu YMMWORD PTR [rdx+1376], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1408] + vmovdqu ymm1, YMMWORD PTR [rdx+1440] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_6: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_6 + mov cl, r10b + sub cl, 96 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_6 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_6 +L_mldsa_use_hint_88_avx2_hints_done_1_6: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 
+ vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1408], ymm4 + vmovdqu YMMWORD PTR [rdx+1440], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1472] + vmovdqu ymm1, YMMWORD PTR [rdx+1504] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_7: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_7 + mov cl, r10b + sub cl, 112 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_7 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_7 
+L_mldsa_use_hint_88_avx2_hints_done_1_7: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1472], ymm4 + vmovdqu YMMWORD PTR [rdx+1504], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1536] + vmovdqu ymm1, YMMWORD PTR [rdx+1568] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_8: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_8 + mov cl, r10b + sub cl, 128 + cmp rcx, 16 + 
jge L_mldsa_use_hint_88_avx2_hints_done_1_8 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_8 +L_mldsa_use_hint_88_avx2_hints_done_1_8: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1536], ymm4 + vmovdqu YMMWORD PTR [rdx+1568], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1600] + vmovdqu ymm1, YMMWORD PTR [rdx+1632] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 
+ xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_9: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_9 + mov cl, r10b + sub cl, 144 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_9 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_9 +L_mldsa_use_hint_88_avx2_hints_done_1_9: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1600], ymm4 + vmovdqu YMMWORD PTR [rdx+1632], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1664] + vmovdqu ymm1, YMMWORD PTR [rdx+1696] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 
+ vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_10: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_10 + mov cl, r10b + sub cl, 160 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_10 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_10 +L_mldsa_use_hint_88_avx2_hints_done_1_10: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1664], ymm4 + vmovdqu YMMWORD PTR [rdx+1696], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1728] + vmovdqu ymm1, YMMWORD PTR [rdx+1760] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, 
ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_11: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_11 + mov cl, r10b + sub cl, 176 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_11 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_11 +L_mldsa_use_hint_88_avx2_hints_done_1_11: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1728], ymm4 + vmovdqu YMMWORD PTR [rdx+1760], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1792] + vmovdqu ymm1, YMMWORD PTR [rdx+1824] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + 
vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_12: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_12 + mov cl, r10b + sub cl, 192 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_12 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_12 +L_mldsa_use_hint_88_avx2_hints_done_1_12: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1792], ymm4 + vmovdqu YMMWORD PTR [rdx+1824], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1856] + vmovdqu ymm1, YMMWORD PTR [rdx+1888] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, 
ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_13: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_13 + mov cl, r10b + sub cl, 208 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_13 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_13 +L_mldsa_use_hint_88_avx2_hints_done_1_13: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1856], ymm4 + vmovdqu YMMWORD PTR [rdx+1888], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1920] + vmovdqu ymm1, YMMWORD PTR [rdx+1952] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand 
ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_14: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_14 + mov cl, r10b + sub cl, 224 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_14 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_14 +L_mldsa_use_hint_88_avx2_hints_done_1_14: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1920], ymm4 + vmovdqu YMMWORD PTR 
[rdx+1952], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+1984] + vmovdqu ymm1, YMMWORD PTR [rdx+2016] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_1_15: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_1_15 + mov cl, r10b + sub cl, 240 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_1_15 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_1_15 +L_mldsa_use_hint_88_avx2_hints_done_1_15: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 
31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+1984], ymm4 + vmovdqu YMMWORD PTR [rdx+2016], ymm5 + ; 3/4 vectors + mov r8b, [rax+82] + vmovdqu ymm0, YMMWORD PTR [rdx+2048] + vmovdqu ymm1, YMMWORD PTR [rdx+2080] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_0: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_0 + mov cl, r10b + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_0 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_0 +L_mldsa_use_hint_88_avx2_hints_done_2_0: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, 
ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2048], ymm4 + vmovdqu YMMWORD PTR [rdx+2080], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2112] + vmovdqu ymm1, YMMWORD PTR [rdx+2144] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_1: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_1 + mov cl, r10b + sub cl, 16 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_1 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_1 +L_mldsa_use_hint_88_avx2_hints_done_2_1: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld 
ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2112], ymm4 + vmovdqu YMMWORD PTR [rdx+2144], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2176] + vmovdqu ymm1, YMMWORD PTR [rdx+2208] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_2: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_2 + mov cl, r10b + sub cl, 32 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_2 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_2 +L_mldsa_use_hint_88_avx2_hints_done_2_2: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 
+ vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2176], ymm4 + vmovdqu YMMWORD PTR [rdx+2208], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2240] + vmovdqu ymm1, YMMWORD PTR [rdx+2272] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_3: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_3 + mov cl, r10b + sub cl, 48 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_3 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp 
L_mldsa_use_hint_88_avx2_hints_next_2_3 +L_mldsa_use_hint_88_avx2_hints_done_2_3: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2240], ymm4 + vmovdqu YMMWORD PTR [rdx+2272], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2304] + vmovdqu ymm1, YMMWORD PTR [rdx+2336] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_4: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_4 + mov 
cl, r10b + sub cl, 64 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_4 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_4 +L_mldsa_use_hint_88_avx2_hints_done_2_4: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2304], ymm4 + vmovdqu YMMWORD PTR [rdx+2336], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2368] + vmovdqu ymm1, YMMWORD PTR [rdx+2400] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, 
ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_5: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_5 + mov cl, r10b + sub cl, 80 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_5 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_5 +L_mldsa_use_hint_88_avx2_hints_done_2_5: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2368], ymm4 + vmovdqu YMMWORD PTR [rdx+2400], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2432] + vmovdqu ymm1, YMMWORD PTR [rdx+2464] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, 
ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_6: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_6 + mov cl, r10b + sub cl, 96 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_6 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_6 +L_mldsa_use_hint_88_avx2_hints_done_2_6: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2432], ymm4 + vmovdqu YMMWORD PTR [rdx+2464], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2496] + vmovdqu ymm1, YMMWORD PTR [rdx+2528] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, 
ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_7: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_7 + mov cl, r10b + sub cl, 112 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_7 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_7 +L_mldsa_use_hint_88_avx2_hints_done_2_7: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2496], ymm4 + vmovdqu YMMWORD PTR [rdx+2528], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2560] + vmovdqu ymm1, YMMWORD PTR [rdx+2592] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + 
vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_8: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_8 + mov cl, r10b + sub cl, 128 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_8 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_8 +L_mldsa_use_hint_88_avx2_hints_done_2_8: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2560], ymm4 + vmovdqu YMMWORD PTR [rdx+2592], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2624] + vmovdqu ymm1, YMMWORD PTR [rdx+2656] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, 
ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_9: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_9 + mov cl, r10b + sub cl, 144 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_9 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_9 +L_mldsa_use_hint_88_avx2_hints_done_2_9: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2624], ymm4 + vmovdqu YMMWORD PTR [rdx+2656], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2688] + vmovdqu ymm1, YMMWORD PTR [rdx+2720] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, 
ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_10: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_10 + mov cl, r10b + sub cl, 160 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_10 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_10 +L_mldsa_use_hint_88_avx2_hints_done_2_10: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2688], ymm4 + vmovdqu 
YMMWORD PTR [rdx+2720], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2752] + vmovdqu ymm1, YMMWORD PTR [rdx+2784] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_11: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_11 + mov cl, r10b + sub cl, 176 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_11 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_11 +L_mldsa_use_hint_88_avx2_hints_done_2_11: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad 
ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2752], ymm4 + vmovdqu YMMWORD PTR [rdx+2784], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2816] + vmovdqu ymm1, YMMWORD PTR [rdx+2848] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_12: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_12 + mov cl, r10b + sub cl, 192 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_12 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_12 +L_mldsa_use_hint_88_avx2_hints_done_2_12: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 
+ vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2816], ymm4 + vmovdqu YMMWORD PTR [rdx+2848], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2880] + vmovdqu ymm1, YMMWORD PTR [rdx+2912] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_13: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_13 + mov cl, r10b + sub cl, 208 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_13 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_13 +L_mldsa_use_hint_88_avx2_hints_done_2_13: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld 
ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2880], ymm4 + vmovdqu YMMWORD PTR [rdx+2912], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+2944] + vmovdqu ymm1, YMMWORD PTR [rdx+2976] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_14: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_14 + mov cl, r10b + sub cl, 224 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_14 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_2_14 +L_mldsa_use_hint_88_avx2_hints_done_2_14: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd 
ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+2944], ymm4 + vmovdqu YMMWORD PTR [rdx+2976], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3008] + vmovdqu ymm1, YMMWORD PTR [rdx+3040] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_2_15: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_2_15 + mov cl, r10b + sub cl, 240 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_2_15 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp 
L_mldsa_use_hint_88_avx2_hints_next_2_15 +L_mldsa_use_hint_88_avx2_hints_done_2_15: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3008], ymm4 + vmovdqu YMMWORD PTR [rdx+3040], ymm5 + ; 4/4 vectors + mov r8b, [rax+83] + vmovdqu ymm0, YMMWORD PTR [rdx+3072] + vmovdqu ymm1, YMMWORD PTR [rdx+3104] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_0: + cmp r9b, r8b + jge 
L_mldsa_use_hint_88_avx2_hints_done_3_0 + mov cl, r10b + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_0 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_0 +L_mldsa_use_hint_88_avx2_hints_done_3_0: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3072], ymm4 + vmovdqu YMMWORD PTR [rdx+3104], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3136] + vmovdqu ymm1, YMMWORD PTR [rdx+3168] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand 
ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_1: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_1 + mov cl, r10b + sub cl, 16 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_1 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_1 +L_mldsa_use_hint_88_avx2_hints_done_3_1: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3136], ymm4 + vmovdqu YMMWORD PTR [rdx+3168], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3200] + vmovdqu ymm1, YMMWORD PTR [rdx+3232] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd 
ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_2: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_2 + mov cl, r10b + sub cl, 32 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_2 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_2 +L_mldsa_use_hint_88_avx2_hints_done_3_2: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3200], ymm4 + vmovdqu YMMWORD PTR [rdx+3232], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3264] + vmovdqu ymm1, YMMWORD PTR [rdx+3296] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd 
ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_3: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_3 + mov cl, r10b + sub cl, 48 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_3 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_3 +L_mldsa_use_hint_88_avx2_hints_done_3_3: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3264], ymm4 + vmovdqu YMMWORD PTR [rdx+3296], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3328] + vmovdqu ymm1, YMMWORD PTR [rdx+3360] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 
+ vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_4: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_4 + mov cl, r10b + sub cl, 64 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_4 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_4 +L_mldsa_use_hint_88_avx2_hints_done_3_4: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3328], ymm4 + vmovdqu YMMWORD PTR [rdx+3360], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3392] + vmovdqu ymm1, YMMWORD PTR [rdx+3424] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, 
ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_5: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_5 + mov cl, r10b + sub cl, 80 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_5 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_5 +L_mldsa_use_hint_88_avx2_hints_done_3_5: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3392], ymm4 + vmovdqu YMMWORD PTR [rdx+3424], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3456] + vmovdqu ymm1, YMMWORD PTR [rdx+3488] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, 
ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_6: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_6 + mov cl, r10b + sub cl, 96 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_6 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_6 +L_mldsa_use_hint_88_avx2_hints_done_3_6: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3456], 
ymm4 + vmovdqu YMMWORD PTR [rdx+3488], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3520] + vmovdqu ymm1, YMMWORD PTR [rdx+3552] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_7: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_7 + mov cl, r10b + sub cl, 112 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_7 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_7 +L_mldsa_use_hint_88_avx2_hints_done_3_7: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 
+ vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3520], ymm4 + vmovdqu YMMWORD PTR [rdx+3552], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3584] + vmovdqu ymm1, YMMWORD PTR [rdx+3616] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_8: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_8 + mov cl, r10b + sub cl, 128 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_8 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_8 +L_mldsa_use_hint_88_avx2_hints_done_3_8: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, 
ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3584], ymm4 + vmovdqu YMMWORD PTR [rdx+3616], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3648] + vmovdqu ymm1, YMMWORD PTR [rdx+3680] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_9: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_9 + mov cl, r10b + sub cl, 144 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_9 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_9 +L_mldsa_use_hint_88_avx2_hints_done_3_9: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld 
ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3648], ymm4 + vmovdqu YMMWORD PTR [rdx+3680], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3712] + vmovdqu ymm1, YMMWORD PTR [rdx+3744] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_10: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_10 + mov cl, r10b + sub cl, 160 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_10 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_10 +L_mldsa_use_hint_88_avx2_hints_done_3_10: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd 
ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3712], ymm4 + vmovdqu YMMWORD PTR [rdx+3744], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3776] + vmovdqu ymm1, YMMWORD PTR [rdx+3808] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_11: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_11 + mov cl, r10b + sub cl, 176 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_11 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp 
L_mldsa_use_hint_88_avx2_hints_next_3_11 +L_mldsa_use_hint_88_avx2_hints_done_3_11: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3776], ymm4 + vmovdqu YMMWORD PTR [rdx+3808], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3840] + vmovdqu ymm1, YMMWORD PTR [rdx+3872] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_12: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_12 + 
mov cl, r10b + sub cl, 192 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_12 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_12 +L_mldsa_use_hint_88_avx2_hints_done_3_12: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3840], ymm4 + vmovdqu YMMWORD PTR [rdx+3872], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3904] + vmovdqu ymm1, YMMWORD PTR [rdx+3936] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand 
ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_13: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_13 + mov cl, r10b + sub cl, 208 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_13 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_13 +L_mldsa_use_hint_88_avx2_hints_done_3_13: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3904], ymm4 + vmovdqu YMMWORD PTR [rdx+3936], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+3968] + vmovdqu ymm1, YMMWORD PTR [rdx+4000] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + 
vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_14: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_14 + mov cl, r10b + sub cl, 224 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_14 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_14 +L_mldsa_use_hint_88_avx2_hints_done_3_14: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+3968], ymm4 + vmovdqu YMMWORD PTR [rdx+4000], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+4032] + vmovdqu ymm1, YMMWORD PTR [rdx+4064] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpmulld ymm4, ymm0, ymm12 + vpmulld ymm5, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm11 + vpaddd ymm5, ymm5, ymm11 + vpsrld ymm4, ymm4, 23 + vpsrld ymm5, ymm5, 23 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, 
ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpcmpeqd ymm0, ymm4, ymm12 + vpcmpeqd ymm1, ymm5, ymm12 + vpaddd ymm2, ymm2, ymm0 + vpaddd ymm3, ymm3, ymm1 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + mov r11, 1 + xor r12, r12 + xor rcx, rcx +L_mldsa_use_hint_88_avx2_hints_next_3_15: + cmp r9b, r8b + jge L_mldsa_use_hint_88_avx2_hints_done_3_15 + mov cl, r10b + sub cl, 240 + cmp rcx, 16 + jge L_mldsa_use_hint_88_avx2_hints_done_3_15 + mov r13, r11 + shl r13, cl + or r12, r13 + inc r9b + mov r10b, [rax+r9] + jmp L_mldsa_use_hint_88_avx2_hints_next_3_15 +L_mldsa_use_hint_88_avx2_hints_done_3_15: + movd xmm6, r12d + shr r12, 8 + movd xmm7, r12d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpcmpgtd ymm0, ymm12, ymm4 + vpcmpgtd ymm1, ymm12, ymm5 + vpand ymm4, ymm4, ymm0 + vpand ymm5, ymm5, ymm1 + vpsrad ymm0, ymm4, 31 + vpsrad ymm1, ymm5, 31 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpaddd ymm4, ymm4, ymm0 + vpaddd ymm5, ymm5, ymm1 + vmovdqu YMMWORD PTR [rdx+4032], ymm4 + vmovdqu YMMWORD PTR [rdx+4064], ymm5 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + add rsp, 144 + pop r13 + pop r12 + ret +wc_mldsa_use_hint_88_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 
+L_mldsa_use_hint_32_avx2_q DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h ; 0x7fe001 = 8380417 in every dword; presumably the ML-DSA modulus q (going by the label) - confirm
+        DWORD 007fe001h, 007fe001h, 007fe001h, 007fe001h ; 8 dwords in total = one 256-bit vector (loaded into ymm8 by the procedure below)
+ptr_L_mldsa_use_hint_32_avx2_q QWORD L_mldsa_use_hint_32_avx2_q ; 64-bit address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_32_avx2_q_low_32 DWORD 0003ff00h, 0003ff00h, 0003ff00h, 0003ff00h ; 0x3ff00 = 261888 = (0x7fe001 - 1) / 32; presumably gamma2 for the "32" parameter sets - confirm
+        DWORD 0003ff00h, 0003ff00h, 0003ff00h, 0003ff00h ; loaded into ymm9 by the procedure below
+ptr_L_mldsa_use_hint_32_avx2_q_low_32 QWORD L_mldsa_use_hint_32_avx2_q_low_32 ; 64-bit address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_32_avx2_q_low_32_2 DWORD 0007fe00h, 0007fe00h, 0007fe00h, 0007fe00h ; 0x7fe00 = 523776 = 2 * 0x3ff00
+        DWORD 0007fe00h, 0007fe00h, 0007fe00h, 0007fe00h ; loaded into ymm10; used as the vpmulld multiplier in the procedure below
+ptr_L_mldsa_use_hint_32_avx2_q_low_32_2 QWORD L_mldsa_use_hint_32_avx2_q_low_32_2 ; 64-bit address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_32_avx2_q_low_32_m1 DWORD 0003feffh, 0003feffh, 0003feffh, 0003feffh ; 0x3feff = 261887 = 0x3ff00 - 1
+        DWORD 0003feffh, 0003feffh, 0003feffh, 0003feffh ; loaded into ymm11; added to each coefficient before the "vpsrld ..., 19" in the procedure below
+ptr_L_mldsa_use_hint_32_avx2_q_low_32_m1 QWORD L_mldsa_use_hint_32_avx2_q_low_32_m1 ; 64-bit address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_32_avx2_mask DWORD 0000000fh, 0000000fh, 0000000fh, 0000000fh ; 0xf: vpand with this keeps only the low 4 bits of each dword
+        DWORD 0000000fh, 0000000fh, 0000000fh, 0000000fh ; loaded into ymm12 by the procedure below
+ptr_L_mldsa_use_hint_32_avx2_mask QWORD L_mldsa_use_hint_32_avx2_mask ; 64-bit address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_32_avx2_vsl DWORD 0000001fh, 0000001eh, 0000001dh, 0000001ch ; per-lane shift counts 31, 30, 29, 28 for lanes 0-3
+        DWORD 0000001bh, 0000001ah, 00000019h, 00000018h ; 27, 26, 25, 24 for lanes 4-7: with vpsllvd, lane j is shifted left by 31-j, so bit j of a broadcast word lands in that lane's bit 31
+ptr_L_mldsa_use_hint_32_avx2_vsl QWORD L_mldsa_use_hint_32_avx2_vsl ; 64-bit address of the table above
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mldsa_use_hint_32_avx2_one DWORD 00000001h, 00000001h, 00000001h, 00000001h ; the value 1 in every dword
+        DWORD 00000001h, 00000001h, 00000001h, 00000001h ; loaded into ymm14 by the procedure below
+ptr_L_mldsa_use_hint_32_avx2_one QWORD L_mldsa_use_hint_32_avx2_one ; 64-bit address of the table above
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_use_hint_32_avx2 PROC
+        push r12 ; r12-r15 are callee-saved under the Microsoft x64 convention; the body uses them as scratch
+        push r13
+        push r14
+        push r15
+        mov rax, rdx ; copy the second-argument register into rax (it is scaled by 10 further down)
+        mov rdx, rcx ; copy the first-argument register into rdx, the base of the YMMWORD loads/stores that follow
+        sub rsp, 144 ; 9 * 16 bytes of stack to spill xmm6-xmm14
+        vmovdqu OWORD PTR [rsp], xmm6 ; spill xmm6 (callee-saved under the Microsoft x64 convention)
+        vmovdqu OWORD PTR [rsp+16], xmm7 ; spill xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8 ; spill xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9 ; spill xmm9
+        vmovdqu OWORD PTR
[rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vpxor ymm8, ymm8, ymm8 + vmovdqu ymm8, YMMWORD PTR L_mldsa_use_hint_32_avx2_q + vmovdqu ymm9, YMMWORD PTR L_mldsa_use_hint_32_avx2_q_low_32 + vmovdqu ymm10, YMMWORD PTR L_mldsa_use_hint_32_avx2_q_low_32_2 + vmovdqu ymm11, YMMWORD PTR L_mldsa_use_hint_32_avx2_q_low_32_m1 + vmovdqu ymm12, YMMWORD PTR L_mldsa_use_hint_32_avx2_mask + vmovdqu ymm13, YMMWORD PTR L_mldsa_use_hint_32_avx2_vsl + vmovdqu ymm14, YMMWORD PTR L_mldsa_use_hint_32_avx2_one + xor r10, r10 + mov r11b, [r8] + imul r15, rax, 10 + sub r15, 5 +L_mldsa_use_hint_32_avx2_start_256: + mov r9b, [r8+r15] + inc r15 + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm1, YMMWORD PTR [rdx+32] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__0: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__0 + mov cl, r11b + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__0 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__0 +L_mldsa_use_hint_32_avx2_hints_done__0: + movd xmm6, r13d + shr r13, 8 + 
movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx], ymm4 + vmovdqu YMMWORD PTR [rdx+32], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+64] + vmovdqu ymm1, YMMWORD PTR [rdx+96] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__1: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__1 + mov cl, r11b + sub cl, 16 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__1 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__1 +L_mldsa_use_hint_32_avx2_hints_done__1: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld 
ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+64], ymm4 + vmovdqu YMMWORD PTR [rdx+96], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+128] + vmovdqu ymm1, YMMWORD PTR [rdx+160] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__2: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__2 + mov cl, r11b + sub cl, 32 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__2 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__2 +L_mldsa_use_hint_32_avx2_hints_done__2: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, 
ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+128], ymm4 + vmovdqu YMMWORD PTR [rdx+160], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+192] + vmovdqu ymm1, YMMWORD PTR [rdx+224] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__3: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__3 + mov cl, r11b + sub cl, 48 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__3 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__3 +L_mldsa_use_hint_32_avx2_hints_done__3: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+192], ymm4 + vmovdqu YMMWORD PTR 
[rdx+224], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+256] + vmovdqu ymm1, YMMWORD PTR [rdx+288] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__4: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__4 + mov cl, r11b + sub cl, 64 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__4 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__4 +L_mldsa_use_hint_32_avx2_hints_done__4: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+256], ymm4 + vmovdqu YMMWORD PTR [rdx+288], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+320] + vmovdqu ymm1, YMMWORD PTR [rdx+352] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand 
ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__5: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__5 + mov cl, r11b + sub cl, 80 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__5 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__5 +L_mldsa_use_hint_32_avx2_hints_done__5: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+320], ymm4 + vmovdqu YMMWORD PTR [rdx+352], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+384] + vmovdqu ymm1, YMMWORD PTR [rdx+416] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + 
vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__6: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__6 + mov cl, r11b + sub cl, 96 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__6 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__6 +L_mldsa_use_hint_32_avx2_hints_done__6: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+384], ymm4 + vmovdqu YMMWORD PTR [rdx+416], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+448] + vmovdqu ymm1, YMMWORD PTR [rdx+480] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, 
ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__7: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__7 + mov cl, r11b + sub cl, 112 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__7 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__7 +L_mldsa_use_hint_32_avx2_hints_done__7: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+448], ymm4 + vmovdqu YMMWORD PTR [rdx+480], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+512] + vmovdqu ymm1, YMMWORD PTR [rdx+544] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd 
ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__8: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__8 + mov cl, r11b + sub cl, 128 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__8 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__8 +L_mldsa_use_hint_32_avx2_hints_done__8: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+512], ymm4 + vmovdqu YMMWORD PTR [rdx+544], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+576] + vmovdqu ymm1, YMMWORD PTR [rdx+608] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov 
r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__9: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__9 + mov cl, r11b + sub cl, 144 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__9 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__9 +L_mldsa_use_hint_32_avx2_hints_done__9: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+576], ymm4 + vmovdqu YMMWORD PTR [rdx+608], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+640] + vmovdqu ymm1, YMMWORD PTR [rdx+672] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__10: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__10 + mov cl, r11b + sub cl, 160 + cmp 
rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__10 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__10 +L_mldsa_use_hint_32_avx2_hints_done__10: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+640], ymm4 + vmovdqu YMMWORD PTR [rdx+672], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+704] + vmovdqu ymm1, YMMWORD PTR [rdx+736] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__11: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__11 + mov cl, r11b + sub cl, 176 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__11 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp 
L_mldsa_use_hint_32_avx2_hints_next__11 +L_mldsa_use_hint_32_avx2_hints_done__11: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+704], ymm4 + vmovdqu YMMWORD PTR [rdx+736], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+768] + vmovdqu ymm1, YMMWORD PTR [rdx+800] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__12: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__12 + mov cl, r11b + sub cl, 192 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__12 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__12 +L_mldsa_use_hint_32_avx2_hints_done__12: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd 
ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+768], ymm4 + vmovdqu YMMWORD PTR [rdx+800], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+832] + vmovdqu ymm1, YMMWORD PTR [rdx+864] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__13: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__13 + mov cl, r11b + sub cl, 208 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__13 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__13 +L_mldsa_use_hint_32_avx2_hints_done__13: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld 
ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+832], ymm4 + vmovdqu YMMWORD PTR [rdx+864], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+896] + vmovdqu ymm1, YMMWORD PTR [rdx+928] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__14: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__14 + mov cl, r11b + sub cl, 224 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__14 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__14 +L_mldsa_use_hint_32_avx2_hints_done__14: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + 
vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+896], ymm4 + vmovdqu YMMWORD PTR [rdx+928], ymm5 + vmovdqu ymm0, YMMWORD PTR [rdx+960] + vmovdqu ymm1, YMMWORD PTR [rdx+992] + vpsrad ymm2, ymm0, 31 + vpsrad ymm3, ymm1, 31 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm4, ymm0, ymm11 + vpaddd ymm5, ymm1, ymm11 + vpsrld ymm4, ymm4, 19 + vpsrld ymm5, ymm5, 19 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsubd ymm2, ymm9, ymm2 + vpsubd ymm3, ymm9, ymm3 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpmulld ymm2, ymm4, ymm10 + vpmulld ymm3, ymm5, ymm10 + vpsubd ymm2, ymm0, ymm2 + vpsubd ymm3, ymm1, ymm3 + vpsrld ymm0, ymm4, 4 + vpsrld ymm1, ymm5, 4 + vpsubd ymm2, ymm2, ymm0 + vpsubd ymm3, ymm3, ymm1 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + mov r12, 1 + xor r13, r13 + xor rcx, rcx +L_mldsa_use_hint_32_avx2_hints_next__15: + cmp r10b, r9b + jge L_mldsa_use_hint_32_avx2_hints_done__15 + mov cl, r11b + sub cl, 240 + cmp rcx, 16 + jge L_mldsa_use_hint_32_avx2_hints_done__15 + mov r14, r12 + shl r14, cl + or r13, r14 + inc r10b + mov r11b, [r8+r10] + jmp L_mldsa_use_hint_32_avx2_hints_next__15 +L_mldsa_use_hint_32_avx2_hints_done__15: + movd xmm6, r13d + shr r13, 8 + movd xmm7, r13d + vpbroadcastd ymm6, xmm6 + vpbroadcastd ymm7, xmm7 + vpsllvd ymm6, ymm6, ymm13 + vpsllvd ymm7, ymm7, ymm13 + vpsrad ymm6, ymm6, 31 + vpsrad ymm7, ymm7, 31 + vpsrld ymm2, ymm2, 31 + vpsrld ymm3, ymm3, 31 + vpslld ymm2, ymm2, 1 + vpslld ymm3, ymm3, 1 + vpsubd ymm2, ymm14, ymm2 + vpsubd ymm3, ymm14, ymm3 + vpand ymm2, ymm2, ymm6 + vpand ymm3, ymm3, ymm7 + vpaddd ymm4, ymm4, ymm2 + vpaddd ymm5, ymm5, ymm3 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vmovdqu YMMWORD PTR [rdx+960], ymm4 + vmovdqu YMMWORD PTR [rdx+992], ymm5 + add rdx, 1024 + 
sub rax, 1
+        jne L_mldsa_use_hint_32_avx2_start_256
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        vmovdqu xmm13, OWORD PTR [rsp+112]
+        vmovdqu xmm14, OWORD PTR [rsp+128]
+        add rsp, 144
+        pop r15
+        pop r14
+        pop r13
+        pop r12
+        ret
+wc_mldsa_use_hint_32_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_vec_check_low_avx2 PROC  ; Range check over signed dwords at rcx: rax = 1 if all pass, 0 on first failing group
+        sub rsp, 32  ; In: rcx = data pointer, rdx = count of 1024-byte (256-dword) blocks, r8d = bound
+        vmovdqu OWORD PTR [rsp], xmm6  ; ymm6/ymm7 are overwritten below, so xmm6/xmm7 are spilled here
+        vmovdqu OWORD PTR [rsp+16], xmm7  ; NOTE(review): presumably because Win64 treats xmm6+ as non-volatile - confirm
+        sub r8d, 1  ; r8d = bound - 1 (largest accepted value)
+        movd xmm2, r8d
+        neg r8d  ; r8d = -(bound - 1) (smallest accepted value)
+        movd xmm3, r8d
+        vpbroadcastd ymm2, xmm2  ; ymm2 = upper limit in all 8 lanes
+        vpbroadcastd ymm3, xmm3  ; ymm3 = lower limit in all 8 lanes
+L_mldsa_vec_check_low_vx2_start_256:  ; one pass handles 256 dwords, 16 at a time; assumes rdx >= 1 on entry (count is tested only after a pass) - TODO confirm
+        vmovdqu ymm0, YMMWORD PTR [rcx]  ; dwords 0..7
+        vmovdqu ymm1, YMMWORD PTR [rcx+32]  ; dwords 8..15
+        vpcmpgtd ymm4, ymm0, ymm2  ; lane = all-ones where value > upper limit (signed)
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0  ; lane = all-ones where lower limit > value (signed)
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5  ; merge the four violation masks into ymm4
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4  ; non-zero if any lane violated a limit
+        cmp rax, 0
+        mov rax, 0  ; preload failure result; mov leaves the flags from cmp intact
+        jne L_mldsa_vec_check_low_vx2_done  ; out of range -> return 0
+        vmovdqu ymm0, YMMWORD PTR [rcx+64]  ; dwords 16..31, same test as the first group
+        vmovdqu ymm1, YMMWORD PTR [rcx+96]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+128]  ; dwords 32..47, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+160]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+192]  ; dwords 48..63, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+224]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+256]  ; dwords 64..79, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+288]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+320]  ; dwords 80..95, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+352]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+384]  ; dwords 96..111, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+416]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+448]  ; dwords 112..127, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+480]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+512]  ; dwords 128..143, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+544]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+576]  ; dwords 144..159, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+608]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+640]  ; dwords 160..175, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+672]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+704]  ; dwords 176..191, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+736]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+768]  ; dwords 192..207, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+800]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+832]  ; dwords 208..223, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+864]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+896]  ; dwords 224..239, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+928]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        vmovdqu ymm0, YMMWORD PTR [rcx+960]  ; dwords 240..255, same test
+        vmovdqu ymm1, YMMWORD PTR [rcx+992]
+        vpcmpgtd ymm4, ymm0, ymm2
+        vpcmpgtd ymm5, ymm1, ymm2
+        vpcmpgtd ymm6, ymm3, ymm0
+        vpcmpgtd ymm7, ymm3, ymm1
+        vpor ymm4, ymm4, ymm5
+        vpor ymm6, ymm6, ymm7
+        vpor ymm4, ymm4, ymm6
+        vpmovmskb rax, ymm4
+        cmp rax, 0
+        mov rax, 0
+        jne L_mldsa_vec_check_low_vx2_done
+        add rcx, 1024  ; advance to the next 256-dword block
+        sub rdx, 1  ; one block fewer to check
+        jne L_mldsa_vec_check_low_vx2_start_256
+        mov rax, 1  ; every block passed -> return 1
+L_mldsa_vec_check_low_vx2_done:  ; rax already holds the result (0 from an early exit, 1 otherwise)
+        vzeroupper
+        vmovdqu xmm6, OWORD PTR [rsp]  ; restore the registers spilled in the prologue
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        add rsp, 32
+        ret
+wc_mldsa_vec_check_low_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+wc_mldsa_poly_add_avx2 PROC
+        sub rsp, 160
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu OWORD PTR [rsp+112], xmm13
+        vmovdqu OWORD PTR [rsp+128], xmm14
+        vmovdqu OWORD PTR [rsp+144], xmm15
+        vmovdqu ymm0, YMMWORD PTR [rcx]
+        vmovdqu ymm1, YMMWORD PTR [rcx+32]
+        vmovdqu ymm2, YMMWORD PTR [rcx+64]
+        vmovdqu ymm3, YMMWORD PTR [rcx+96]
+        vmovdqu ymm4, YMMWORD PTR [rcx+128]
+        vmovdqu ymm5, YMMWORD PTR [rcx+160]
+        vmovdqu ymm6, YMMWORD PTR [rcx+192]
+        vmovdqu ymm7, YMMWORD PTR [rcx+224]
+        vmovdqu ymm8, YMMWORD PTR [rdx]
+        vmovdqu ymm9, YMMWORD PTR [rdx+32]
+        vmovdqu ymm10, YMMWORD PTR [rdx+64]
+        vmovdqu ymm11, YMMWORD PTR [rdx+96]
+        vmovdqu ymm12, YMMWORD PTR [rdx+128]
+        vmovdqu ymm13, YMMWORD PTR [rdx+160]
+        vmovdqu ymm14, YMMWORD PTR [rdx+192]
+        vmovdqu ymm15, YMMWORD PTR [rdx+224]
+        vpaddd ymm0, ymm0, ymm8
+        vpaddd ymm1, ymm1, ymm9
+        vpaddd ymm2, ymm2, ymm10
+        vpaddd ymm3, ymm3, ymm11
+        vpaddd ymm4, ymm4, ymm12
+        vpaddd ymm5, ymm5, ymm13
+        vpaddd ymm6, ymm6, ymm14
+        vpaddd ymm7, ymm7, ymm15
+        vmovdqu YMMWORD PTR [rcx], ymm0
+        vmovdqu YMMWORD PTR [rcx+32], ymm1
+        vmovdqu YMMWORD PTR [rcx+64], ymm2
+        vmovdqu YMMWORD PTR [rcx+96], ymm3
+        vmovdqu YMMWORD PTR [rcx+128], ymm4 +
vmovdqu YMMWORD PTR [rcx+160], ymm5 + vmovdqu YMMWORD PTR [rcx+192], ymm6 + vmovdqu YMMWORD PTR [rcx+224], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vmovdqu ymm4, YMMWORD PTR [rcx+384] + vmovdqu ymm5, YMMWORD PTR [rcx+416] + vmovdqu ymm6, YMMWORD PTR [rcx+448] + vmovdqu ymm7, YMMWORD PTR [rcx+480] + vmovdqu ymm8, YMMWORD PTR [rdx+256] + vmovdqu ymm9, YMMWORD PTR [rdx+288] + vmovdqu ymm10, YMMWORD PTR [rdx+320] + vmovdqu ymm11, YMMWORD PTR [rdx+352] + vmovdqu ymm12, YMMWORD PTR [rdx+384] + vmovdqu ymm13, YMMWORD PTR [rdx+416] + vmovdqu ymm14, YMMWORD PTR [rdx+448] + vmovdqu ymm15, YMMWORD PTR [rdx+480] + vpaddd ymm0, ymm0, ymm8 + vpaddd ymm1, ymm1, ymm9 + vpaddd ymm2, ymm2, ymm10 + vpaddd ymm3, ymm3, ymm11 + vpaddd ymm4, ymm4, ymm12 + vpaddd ymm5, ymm5, ymm13 + vpaddd ymm6, ymm6, ymm14 + vpaddd ymm7, ymm7, ymm15 + vmovdqu YMMWORD PTR [rcx+256], ymm0 + vmovdqu YMMWORD PTR [rcx+288], ymm1 + vmovdqu YMMWORD PTR [rcx+320], ymm2 + vmovdqu YMMWORD PTR [rcx+352], ymm3 + vmovdqu YMMWORD PTR [rcx+384], ymm4 + vmovdqu YMMWORD PTR [rcx+416], ymm5 + vmovdqu YMMWORD PTR [rcx+448], ymm6 + vmovdqu YMMWORD PTR [rcx+480], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+512] + vmovdqu ymm1, YMMWORD PTR [rcx+544] + vmovdqu ymm2, YMMWORD PTR [rcx+576] + vmovdqu ymm3, YMMWORD PTR [rcx+608] + vmovdqu ymm4, YMMWORD PTR [rcx+640] + vmovdqu ymm5, YMMWORD PTR [rcx+672] + vmovdqu ymm6, YMMWORD PTR [rcx+704] + vmovdqu ymm7, YMMWORD PTR [rcx+736] + vmovdqu ymm8, YMMWORD PTR [rdx+512] + vmovdqu ymm9, YMMWORD PTR [rdx+544] + vmovdqu ymm10, YMMWORD PTR [rdx+576] + vmovdqu ymm11, YMMWORD PTR [rdx+608] + vmovdqu ymm12, YMMWORD PTR [rdx+640] + vmovdqu ymm13, YMMWORD PTR [rdx+672] + vmovdqu ymm14, YMMWORD PTR [rdx+704] + vmovdqu ymm15, YMMWORD PTR [rdx+736] + vpaddd ymm0, ymm0, ymm8 + vpaddd ymm1, ymm1, ymm9 + vpaddd ymm2, ymm2, ymm10 + vpaddd ymm3, ymm3, ymm11 + vpaddd ymm4, ymm4, ymm12 + vpaddd 
ymm5, ymm5, ymm13 + vpaddd ymm6, ymm6, ymm14 + vpaddd ymm7, ymm7, ymm15 + vmovdqu YMMWORD PTR [rcx+512], ymm0 + vmovdqu YMMWORD PTR [rcx+544], ymm1 + vmovdqu YMMWORD PTR [rcx+576], ymm2 + vmovdqu YMMWORD PTR [rcx+608], ymm3 + vmovdqu YMMWORD PTR [rcx+640], ymm4 + vmovdqu YMMWORD PTR [rcx+672], ymm5 + vmovdqu YMMWORD PTR [rcx+704], ymm6 + vmovdqu YMMWORD PTR [rcx+736], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+768] + vmovdqu ymm1, YMMWORD PTR [rcx+800] + vmovdqu ymm2, YMMWORD PTR [rcx+832] + vmovdqu ymm3, YMMWORD PTR [rcx+864] + vmovdqu ymm4, YMMWORD PTR [rcx+896] + vmovdqu ymm5, YMMWORD PTR [rcx+928] + vmovdqu ymm6, YMMWORD PTR [rcx+960] + vmovdqu ymm7, YMMWORD PTR [rcx+992] + vmovdqu ymm8, YMMWORD PTR [rdx+768] + vmovdqu ymm9, YMMWORD PTR [rdx+800] + vmovdqu ymm10, YMMWORD PTR [rdx+832] + vmovdqu ymm11, YMMWORD PTR [rdx+864] + vmovdqu ymm12, YMMWORD PTR [rdx+896] + vmovdqu ymm13, YMMWORD PTR [rdx+928] + vmovdqu ymm14, YMMWORD PTR [rdx+960] + vmovdqu ymm15, YMMWORD PTR [rdx+992] + vpaddd ymm0, ymm0, ymm8 + vpaddd ymm1, ymm1, ymm9 + vpaddd ymm2, ymm2, ymm10 + vpaddd ymm3, ymm3, ymm11 + vpaddd ymm4, ymm4, ymm12 + vpaddd ymm5, ymm5, ymm13 + vpaddd ymm6, ymm6, ymm14 + vpaddd ymm7, ymm7, ymm15 + vmovdqu YMMWORD PTR [rcx+768], ymm0 + vmovdqu YMMWORD PTR [rcx+800], ymm1 + vmovdqu YMMWORD PTR [rcx+832], ymm2 + vmovdqu YMMWORD PTR [rcx+864], ymm3 + vmovdqu YMMWORD PTR [rcx+896], ymm4 + vmovdqu YMMWORD PTR [rcx+928], ymm5 + vmovdqu YMMWORD PTR [rcx+960], ymm6 + vmovdqu YMMWORD PTR [rcx+992], ymm7 + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +wc_mldsa_poly_add_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +wc_mldsa_poly_sub_avx2 PROC + sub rsp, 160 
+        ; -------------------------------------------------------------------
+        ; Body of wc_mldsa_poly_sub_avx2 (160 bytes of stack were reserved by
+        ; the preceding instruction).
+        ;
+        ; In-place, element-wise subtraction of two 1024-byte buffers handled
+        ; as 256 packed 32-bit lanes:  [rcx][i] = [rcx][i] - [rdx][i]
+        ; Plain VPSUBD (wraps modulo 2^32); no reduction is done here.
+        ;
+        ; In:   rcx = minuend and destination (read, then overwritten)
+        ;       rdx = subtrahend (read only)
+        ; Out:  no return value is produced; rax is not written.
+        ; NOTE(review): C prototype not visible here - confirm argument types.
+        ;
+        ; Fully unrolled as four 256-byte batches; ymm0-ymm7 hold destination
+        ; data, ymm8-ymm15 hold the subtrahend.
+        ; -------------------------------------------------------------------
+        ; Spill the callee-saved XMM registers (10 x 16 bytes).
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu OWORD PTR [rsp+112], xmm13
+        vmovdqu OWORD PTR [rsp+128], xmm14
+        vmovdqu OWORD PTR [rsp+144], xmm15
+        ; Batch 1: bytes 0..255
+        vmovdqu ymm0, YMMWORD PTR [rcx]
+        vmovdqu ymm1, YMMWORD PTR [rcx+32]
+        vmovdqu ymm2, YMMWORD PTR [rcx+64]
+        vmovdqu ymm3, YMMWORD PTR [rcx+96]
+        vmovdqu ymm4, YMMWORD PTR [rcx+128]
+        vmovdqu ymm5, YMMWORD PTR [rcx+160]
+        vmovdqu ymm6, YMMWORD PTR [rcx+192]
+        vmovdqu ymm7, YMMWORD PTR [rcx+224]
+        vmovdqu ymm8, YMMWORD PTR [rdx]
+        vmovdqu ymm9, YMMWORD PTR [rdx+32]
+        vmovdqu ymm10, YMMWORD PTR [rdx+64]
+        vmovdqu ymm11, YMMWORD PTR [rdx+96]
+        vmovdqu ymm12, YMMWORD PTR [rdx+128]
+        vmovdqu ymm13, YMMWORD PTR [rdx+160]
+        vmovdqu ymm14, YMMWORD PTR [rdx+192]
+        vmovdqu ymm15, YMMWORD PTR [rdx+224]
+        vpsubd ymm0, ymm0, ymm8
+        vpsubd ymm1, ymm1, ymm9
+        vpsubd ymm2, ymm2, ymm10
+        vpsubd ymm3, ymm3, ymm11
+        vpsubd ymm4, ymm4, ymm12
+        vpsubd ymm5, ymm5, ymm13
+        vpsubd ymm6, ymm6, ymm14
+        vpsubd ymm7, ymm7, ymm15
+        vmovdqu YMMWORD PTR [rcx], ymm0
+        vmovdqu YMMWORD PTR [rcx+32], ymm1
+        vmovdqu YMMWORD PTR [rcx+64], ymm2
+        vmovdqu YMMWORD PTR [rcx+96], ymm3
+        vmovdqu YMMWORD PTR [rcx+128], ymm4
+        vmovdqu YMMWORD PTR [rcx+160], ymm5
+        vmovdqu YMMWORD PTR [rcx+192], ymm6
+        vmovdqu YMMWORD PTR [rcx+224], ymm7
+        ; Batch 2: bytes 256..511
+        vmovdqu ymm0, YMMWORD PTR [rcx+256]
+        vmovdqu ymm1, YMMWORD PTR [rcx+288]
+        vmovdqu ymm2, YMMWORD PTR [rcx+320]
+        vmovdqu ymm3, YMMWORD PTR [rcx+352]
+        vmovdqu ymm4, YMMWORD PTR [rcx+384]
+        vmovdqu ymm5, YMMWORD PTR [rcx+416]
+        vmovdqu ymm6, YMMWORD PTR [rcx+448]
+        vmovdqu ymm7, YMMWORD PTR [rcx+480]
+        vmovdqu ymm8, YMMWORD PTR [rdx+256]
+        vmovdqu ymm9, YMMWORD PTR [rdx+288]
+        vmovdqu ymm10, YMMWORD PTR [rdx+320]
+        vmovdqu ymm11, YMMWORD PTR [rdx+352]
+        vmovdqu ymm12, YMMWORD PTR [rdx+384]
+        vmovdqu ymm13, YMMWORD PTR [rdx+416]
+        vmovdqu ymm14, YMMWORD PTR [rdx+448]
+        vmovdqu ymm15, YMMWORD PTR [rdx+480]
+        vpsubd ymm0, ymm0, ymm8
+        vpsubd ymm1, ymm1, ymm9
+        vpsubd ymm2, ymm2, ymm10
+        vpsubd ymm3, ymm3, ymm11
+        vpsubd ymm4, ymm4, ymm12
+        vpsubd ymm5, ymm5, ymm13
+        vpsubd ymm6, ymm6, ymm14
+        vpsubd ymm7, ymm7, ymm15
+        vmovdqu YMMWORD PTR [rcx+256], ymm0
+        vmovdqu YMMWORD PTR [rcx+288], ymm1
+        vmovdqu YMMWORD PTR [rcx+320], ymm2
+        vmovdqu YMMWORD PTR [rcx+352], ymm3
+        vmovdqu YMMWORD PTR [rcx+384], ymm4
+        vmovdqu YMMWORD PTR [rcx+416], ymm5
+        vmovdqu YMMWORD PTR [rcx+448], ymm6
+        vmovdqu YMMWORD PTR [rcx+480], ymm7
+        ; Batch 3: bytes 512..767
+        vmovdqu ymm0, YMMWORD PTR [rcx+512]
+        vmovdqu ymm1, YMMWORD PTR [rcx+544]
+        vmovdqu ymm2, YMMWORD PTR [rcx+576]
+        vmovdqu ymm3, YMMWORD PTR [rcx+608]
+        vmovdqu ymm4, YMMWORD PTR [rcx+640]
+        vmovdqu ymm5, YMMWORD PTR [rcx+672]
+        vmovdqu ymm6, YMMWORD PTR [rcx+704]
+        vmovdqu ymm7, YMMWORD PTR [rcx+736]
+        vmovdqu ymm8, YMMWORD PTR [rdx+512]
+        vmovdqu ymm9, YMMWORD PTR [rdx+544]
+        vmovdqu ymm10, YMMWORD PTR [rdx+576]
+        vmovdqu ymm11, YMMWORD PTR [rdx+608]
+        vmovdqu ymm12, YMMWORD PTR [rdx+640]
+        vmovdqu ymm13, YMMWORD PTR [rdx+672]
+        vmovdqu ymm14, YMMWORD PTR [rdx+704]
+        vmovdqu ymm15, YMMWORD PTR [rdx+736]
+        vpsubd ymm0, ymm0, ymm8
+        vpsubd ymm1, ymm1, ymm9
+        vpsubd ymm2, ymm2, ymm10
+        vpsubd ymm3, ymm3, ymm11
+        vpsubd ymm4, ymm4, ymm12
+        vpsubd ymm5, ymm5, ymm13
+        vpsubd ymm6, ymm6, ymm14
+        vpsubd ymm7, ymm7, ymm15
+        vmovdqu YMMWORD PTR [rcx+512], ymm0
+        vmovdqu YMMWORD PTR [rcx+544], ymm1
+        vmovdqu YMMWORD PTR [rcx+576], ymm2
+        vmovdqu YMMWORD PTR [rcx+608], ymm3
+        vmovdqu YMMWORD PTR [rcx+640], ymm4
+        vmovdqu YMMWORD PTR [rcx+672], ymm5
+        vmovdqu YMMWORD PTR [rcx+704], ymm6
+        vmovdqu YMMWORD PTR [rcx+736], ymm7
+        ; Batch 4: bytes 768..1023
+        vmovdqu ymm0, YMMWORD PTR [rcx+768]
+        vmovdqu ymm1, YMMWORD PTR [rcx+800]
+        vmovdqu ymm2, YMMWORD PTR [rcx+832]
+        vmovdqu ymm3, YMMWORD PTR [rcx+864]
+        vmovdqu ymm4, YMMWORD PTR [rcx+896]
+        vmovdqu ymm5, YMMWORD PTR [rcx+928]
+        vmovdqu ymm6, YMMWORD PTR [rcx+960]
+        vmovdqu ymm7, YMMWORD PTR [rcx+992]
+        vmovdqu ymm8, YMMWORD PTR [rdx+768]
+        vmovdqu ymm9, YMMWORD PTR [rdx+800]
+        vmovdqu ymm10, YMMWORD PTR [rdx+832]
+        vmovdqu ymm11, YMMWORD PTR [rdx+864]
+        vmovdqu ymm12, YMMWORD PTR [rdx+896]
+        vmovdqu ymm13, YMMWORD PTR [rdx+928]
+        vmovdqu ymm14, YMMWORD PTR [rdx+960]
+        vmovdqu ymm15, YMMWORD PTR [rdx+992]
+        vpsubd ymm0, ymm0, ymm8
+        vpsubd ymm1, ymm1, ymm9
+        vpsubd ymm2, ymm2, ymm10
+        vpsubd ymm3, ymm3, ymm11
+        vpsubd ymm4, ymm4, ymm12
+        vpsubd ymm5, ymm5, ymm13
+        vpsubd ymm6, ymm6, ymm14
+        vpsubd ymm7, ymm7, ymm15
+        vmovdqu YMMWORD PTR [rcx+768], ymm0
+        vmovdqu YMMWORD PTR [rcx+800], ymm1
+        vmovdqu YMMWORD PTR [rcx+832], ymm2
+        vmovdqu YMMWORD PTR [rcx+864], ymm3
+        vmovdqu YMMWORD PTR [rcx+896], ymm4
+        vmovdqu YMMWORD PTR [rcx+928], ymm5
+        vmovdqu YMMWORD PTR [rcx+960], ymm6
+        vmovdqu YMMWORD PTR [rcx+992], ymm7
+        ; Clear the upper YMM halves before returning so that legacy-SSE code
+        ; in the caller does not run with dirty upper state. The low 128 bits
+        ; are untouched and the results are already stored to memory.
+        vzeroupper
+        ; Reload the callee-saved XMM registers and release the spill area.
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        vmovdqu xmm13, OWORD PTR [rsp+112]
+        vmovdqu xmm14, OWORD PTR [rsp+128]
+        vmovdqu xmm15, OWORD PTR [rsp+144]
+        add rsp, 160
+        ret
+wc_mldsa_poly_sub_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA
+; ---------------------------------------------------------------------------
+; wc_mldsa_poly_make_pos_avx2
+;
+; In-place pass over a 1024-byte buffer handled as 256 packed signed 32-bit
+; lanes. For every lane that is negative, the corresponding lane of the
+; 32-byte constant mldsa_q is added; non-negative lanes are left unchanged:
+;       mask = (0 > x) ? 0FFFFFFFFh : 0        (vpcmpgtd against zero)
+;       x    = x + (mask AND q)
+;
+; In:   rcx = buffer (read, then overwritten)
+; Out:  no return value is produced; rax is not written.
+; Uses: ymm8 = all zero, ymm9 = mldsa_q, ymm0-ymm3 data, ymm4-ymm7 masks.
+; NOTE(review): mldsa_q is defined elsewhere in this file - presumably eight
+;       copies of the ML-DSA modulus; confirm at its definition.
+;
+; Fully unrolled as eight 128-byte batches. xmm6-xmm9 are non-volatile in
+; the Microsoft x64 convention and are spilled to a 64-byte stack area.
+; ---------------------------------------------------------------------------
+wc_mldsa_poly_make_pos_avx2 PROC
+        ; Spill the callee-saved XMM registers that are used (4 x 16 bytes).
+        sub rsp, 64
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        ; ymm8 = 0 (comparison reference), ymm9 = value added to negatives.
+        vpxor ymm8, ymm8, ymm8
+        vmovdqu ymm9, YMMWORD PTR mldsa_q
+        ; Batch 1: bytes 0..127
+        vmovdqu ymm0, YMMWORD PTR [rcx]
+        vmovdqu ymm1, YMMWORD PTR [rcx+32]
+        vmovdqu ymm2, YMMWORD PTR [rcx+64]
+        vmovdqu ymm3, YMMWORD PTR [rcx+96]
+        vpcmpgtd ymm4, ymm8, ymm0
+        vpcmpgtd ymm5, ymm8, ymm1
+        vpcmpgtd ymm6, ymm8, ymm2
+        vpcmpgtd ymm7, ymm8, ymm3
+        vpand ymm4, ymm4, ymm9
+        vpand ymm5, ymm5, ymm9
+        vpand ymm6, ymm6, ymm9
+        vpand ymm7, ymm7, ymm9
+        vpaddd ymm0, ymm0, ymm4
+        vpaddd ymm1, ymm1, ymm5
+        vpaddd ymm2, ymm2, ymm6
+        vpaddd ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rcx], ymm0
+        vmovdqu YMMWORD PTR [rcx+32], ymm1
+        vmovdqu YMMWORD PTR [rcx+64], ymm2
+        vmovdqu YMMWORD PTR [rcx+96], ymm3
+        ; Batch 2: bytes 128..255
+        vmovdqu ymm0, YMMWORD PTR [rcx+128]
+        vmovdqu ymm1, YMMWORD PTR [rcx+160]
+        vmovdqu ymm2, YMMWORD PTR [rcx+192]
+        vmovdqu ymm3, YMMWORD PTR [rcx+224]
+        vpcmpgtd ymm4, ymm8, ymm0
+        vpcmpgtd ymm5, ymm8, ymm1
+        vpcmpgtd ymm6, ymm8, ymm2
+        vpcmpgtd ymm7, ymm8, ymm3
+        vpand ymm4, ymm4, ymm9
+        vpand ymm5, ymm5, ymm9
+        vpand ymm6, ymm6, ymm9
+        vpand ymm7, ymm7, ymm9
+        vpaddd ymm0, ymm0, ymm4
+        vpaddd ymm1, ymm1, ymm5
+        vpaddd ymm2, ymm2, ymm6
+        vpaddd ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rcx+128], ymm0
+        vmovdqu YMMWORD PTR [rcx+160], ymm1
+        vmovdqu YMMWORD PTR [rcx+192], ymm2
+        vmovdqu YMMWORD PTR [rcx+224], ymm3
+        ; Batch 3: bytes 256..383
+        vmovdqu ymm0, YMMWORD PTR [rcx+256]
+        vmovdqu ymm1, YMMWORD PTR [rcx+288]
+        vmovdqu ymm2, YMMWORD PTR [rcx+320]
+        vmovdqu ymm3, YMMWORD PTR [rcx+352]
+        vpcmpgtd ymm4, ymm8, ymm0
+        vpcmpgtd ymm5, ymm8, ymm1
+        vpcmpgtd ymm6, ymm8, ymm2
+        vpcmpgtd ymm7, ymm8, ymm3
+        vpand ymm4, ymm4, ymm9
+        vpand ymm5, ymm5, ymm9
+        vpand ymm6, ymm6, ymm9
+        vpand ymm7, ymm7, ymm9
+        vpaddd ymm0, ymm0, ymm4
+        vpaddd ymm1, ymm1, ymm5
+        vpaddd ymm2, ymm2, ymm6
+        vpaddd ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rcx+256], ymm0
+        vmovdqu YMMWORD PTR [rcx+288], ymm1
+        vmovdqu YMMWORD PTR [rcx+320], ymm2
+        vmovdqu YMMWORD PTR [rcx+352], ymm3
+        ; Batch 4: bytes 384..511
+        vmovdqu ymm0, YMMWORD PTR [rcx+384]
+        vmovdqu ymm1, YMMWORD PTR [rcx+416]
+        vmovdqu ymm2, YMMWORD PTR [rcx+448]
+        vmovdqu ymm3, YMMWORD PTR [rcx+480]
+        vpcmpgtd ymm4, ymm8, ymm0
+        vpcmpgtd ymm5, ymm8, ymm1
+        vpcmpgtd ymm6, ymm8, ymm2
+        vpcmpgtd ymm7, ymm8, ymm3
+        vpand ymm4, ymm4, ymm9
+        vpand ymm5, ymm5, ymm9
+        vpand ymm6, ymm6, ymm9
+        vpand ymm7, ymm7, ymm9
+        vpaddd ymm0, ymm0, ymm4
+        vpaddd ymm1, ymm1, ymm5
+        vpaddd ymm2, ymm2, ymm6
+        vpaddd ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rcx+384], ymm0
+        vmovdqu YMMWORD PTR [rcx+416], ymm1
+        vmovdqu YMMWORD PTR [rcx+448], ymm2
+        vmovdqu YMMWORD PTR [rcx+480], ymm3
+        ; Batch 5: bytes 512..639
+        vmovdqu ymm0, YMMWORD PTR [rcx+512]
+        vmovdqu ymm1, YMMWORD PTR [rcx+544]
+        vmovdqu ymm2, YMMWORD PTR [rcx+576]
+        vmovdqu ymm3, YMMWORD PTR [rcx+608]
+        vpcmpgtd ymm4, ymm8, ymm0
+        vpcmpgtd ymm5, ymm8, ymm1
+        vpcmpgtd ymm6, ymm8, ymm2
+        vpcmpgtd ymm7, ymm8, ymm3
+        vpand ymm4, ymm4, ymm9
+        vpand ymm5, ymm5, ymm9
+        vpand ymm6, ymm6, ymm9
+        vpand ymm7, ymm7, ymm9
+        vpaddd ymm0, ymm0, ymm4
+        vpaddd ymm1, ymm1, ymm5
+        vpaddd ymm2, ymm2, ymm6
+        vpaddd ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rcx+512], ymm0
+        vmovdqu YMMWORD PTR [rcx+544], ymm1
+        vmovdqu YMMWORD PTR [rcx+576], ymm2
+        vmovdqu YMMWORD PTR [rcx+608], ymm3
+        ; Batch 6: bytes 640..767
+        vmovdqu ymm0, YMMWORD PTR [rcx+640]
+        vmovdqu ymm1, YMMWORD PTR [rcx+672]
+        vmovdqu ymm2, YMMWORD PTR [rcx+704]
+        vmovdqu ymm3, YMMWORD PTR [rcx+736]
+        vpcmpgtd ymm4, ymm8, ymm0
+        vpcmpgtd ymm5, ymm8, ymm1
+        vpcmpgtd ymm6, ymm8, ymm2
+        vpcmpgtd ymm7, ymm8, ymm3
+        vpand ymm4, ymm4, ymm9
+        vpand ymm5, ymm5, ymm9
+        vpand ymm6, ymm6, ymm9
+        vpand ymm7, ymm7, ymm9
+        vpaddd ymm0, ymm0, ymm4
+        vpaddd ymm1, ymm1, ymm5
+        vpaddd ymm2, ymm2, ymm6
+        vpaddd ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rcx+640], ymm0
+        vmovdqu YMMWORD PTR [rcx+672], ymm1
+        vmovdqu YMMWORD PTR [rcx+704], ymm2
+        vmovdqu YMMWORD PTR [rcx+736], ymm3
+        ; Batch 7: bytes 768..895
+        vmovdqu ymm0, YMMWORD PTR [rcx+768]
+        vmovdqu ymm1, YMMWORD PTR [rcx+800]
+        vmovdqu ymm2, YMMWORD PTR [rcx+832]
+        vmovdqu ymm3, YMMWORD PTR [rcx+864]
+        vpcmpgtd ymm4, ymm8, ymm0
+        vpcmpgtd ymm5, ymm8, ymm1
+        vpcmpgtd ymm6, ymm8, ymm2
+        vpcmpgtd ymm7, ymm8, ymm3
+        vpand ymm4, ymm4, ymm9
+        vpand ymm5, ymm5, ymm9
+        vpand ymm6, ymm6, ymm9
+        vpand ymm7, ymm7, ymm9
+        vpaddd ymm0, ymm0, ymm4
+        vpaddd ymm1, ymm1, ymm5
+        vpaddd ymm2, ymm2, ymm6
+        vpaddd ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rcx+768], ymm0
+        vmovdqu YMMWORD PTR [rcx+800], ymm1
+        vmovdqu YMMWORD PTR [rcx+832], ymm2
+        vmovdqu YMMWORD PTR [rcx+864], ymm3
+        ; Batch 8: bytes 896..1023
+        vmovdqu ymm0, YMMWORD PTR [rcx+896]
+        vmovdqu ymm1, YMMWORD PTR [rcx+928]
+        vmovdqu ymm2, YMMWORD PTR [rcx+960]
+        vmovdqu ymm3, YMMWORD PTR [rcx+992]
+        vpcmpgtd ymm4, ymm8, ymm0
+        vpcmpgtd ymm5, ymm8, ymm1
+        vpcmpgtd ymm6, ymm8, ymm2
+        vpcmpgtd ymm7, ymm8, ymm3
+        vpand ymm4, ymm4, ymm9
+        vpand ymm5, ymm5, ymm9
+        vpand ymm6, ymm6, ymm9
+        vpand ymm7, ymm7, ymm9
+        vpaddd ymm0, ymm0, ymm4
+        vpaddd ymm1, ymm1, ymm5
+        vpaddd ymm2, ymm2, ymm6
+        vpaddd ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rcx+896], ymm0
+        vmovdqu YMMWORD PTR [rcx+928], ymm1
+        vmovdqu YMMWORD PTR [rcx+960], ymm2
+        vmovdqu YMMWORD PTR [rcx+992], ymm3
+        ; Clear the upper YMM halves before returning so that legacy-SSE code
+        ; in the caller does not run with dirty upper state. The low 128 bits
+        ; are untouched and the results are already stored to memory.
+        vzeroupper
+        ; Reload the callee-saved XMM registers and release the spill area.
+        vmovdqu xmm6, OWORD PTR [rsp]
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        add rsp, 64
+        ret
+wc_mldsa_poly_make_pos_avx2 ENDP
+_TEXT ENDS
+ENDIF
+ENDIF
+END
diff --git a/wolfcrypt/src/wc_mlkem_asm.asm b/wolfcrypt/src/wc_mlkem_asm.asm
new file mode 100644
index 00000000000..62743aa846d
--- /dev/null
+++ b/wolfcrypt/src/wc_mlkem_asm.asm
@@ -0,0 +1,15435 @@
+; /* wc_mlkem_asm.asm */
+; /*
+; * Copyright (C) 2006-2026 wolfSSL Inc.
+; *
+; * This file is part of wolfSSL.
+; *
+; * wolfSSL is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 3 of the License, or
+; * (at your option) any later version.
+; *
+; * wolfSSL is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; * +; * You should have received a copy of the GNU General Public License +; * along with this program; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA +; */ + +IF @Version LT 1200 +; AVX2 instructions not recognized by old versions of MASM +IFNDEF NO_AVX2_SUPPORT +NO_AVX2_SUPPORT = 1 +ENDIF +; MOVBE instruction not recognized by old versions of MASM +IFNDEF NO_MOVBE_SUPPORT +NO_MOVBE_SUPPORT = 1 +ENDIF +ENDIF + +IFNDEF HAVE_INTEL_AVX1 +HAVE_INTEL_AVX1 = 1 +ENDIF +IFNDEF NO_AVX2_SUPPORT +HAVE_INTEL_AVX2 = 1 +ENDIF + +IFNDEF _WIN64 +_WIN64 = 1 +ENDIF + +IFDEF WOLFSSL_HAVE_MLKEM +IFDEF HAVE_INTEL_AVX2 +_DATA SEGMENT +ALIGN 16 +mlkem_q WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h + WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h +ptr_mlkem_q QWORD mlkem_q +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +mlkem_qinv WORD 0f301h, 0f301h, 0f301h, 0f301h, 0f301h, 0f301h, 0f301h, 0f301h + WORD 0f301h, 0f301h, 0f301h, 0f301h, 0f301h, 0f301h, 0f301h, 0f301h +ptr_mlkem_qinv QWORD mlkem_qinv +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +mlkem_f WORD 0549h, 0549h, 0549h, 0549h, 0549h, 0549h, 0549h, 0549h + WORD 0549h, 0549h, 0549h, 0549h, 0549h, 0549h, 0549h, 0549h +ptr_mlkem_f QWORD mlkem_f +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +mlkem_f_qinv WORD 5049h, 5049h, 5049h, 5049h, 5049h, 5049h, 5049h, 5049h + WORD 5049h, 5049h, 5049h, 5049h, 5049h, 5049h, 5049h, 5049h +ptr_mlkem_f_qinv QWORD mlkem_f_qinv +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +mlkem_v WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh + WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh +ptr_mlkem_v QWORD mlkem_v +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_avx2_zetas WORD 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh + WORD 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh, 0a0bh + WORD 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh + WORD 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh, 7b0bh + WORD 0b9ah, 
0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah + WORD 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah, 0b9ah + WORD 399ah, 399ah, 399ah, 399ah, 399ah, 399ah, 399ah, 399ah + WORD 399ah, 399ah, 399ah, 399ah, 399ah, 399ah, 399ah, 399ah + WORD 05d5h, 05d5h, 05d5h, 05d5h, 05d5h, 05d5h, 05d5h, 05d5h + WORD 05d5h, 05d5h, 05d5h, 05d5h, 05d5h, 05d5h, 05d5h, 05d5h + WORD 34d5h, 34d5h, 34d5h, 34d5h, 34d5h, 34d5h, 34d5h, 34d5h + WORD 34d5h, 34d5h, 34d5h, 34d5h, 34d5h, 34d5h, 34d5h, 34d5h + WORD 058eh, 058eh, 058eh, 058eh, 058eh, 058eh, 058eh, 058eh + WORD 058eh, 058eh, 058eh, 058eh, 058eh, 058eh, 058eh, 058eh + WORD 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh + WORD 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh, 0cf8eh + WORD 0c56h, 0c56h, 0c56h, 0c56h, 0c56h, 0c56h, 0c56h, 0c56h + WORD 0c56h, 0c56h, 0c56h, 0c56h, 0c56h, 0c56h, 0c56h, 0c56h + WORD 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h + WORD 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h, 0ae56h + WORD 026eh, 026eh, 026eh, 026eh, 026eh, 026eh, 026eh, 026eh + WORD 026eh, 026eh, 026eh, 026eh, 026eh, 026eh, 026eh, 026eh + WORD 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh + WORD 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh, 6c6eh + WORD 0629h, 0629h, 0629h, 0629h, 0629h, 0629h, 0629h, 0629h + WORD 0629h, 0629h, 0629h, 0629h, 0629h, 0629h, 0629h, 0629h + WORD 0f129h, 0f129h, 0f129h, 0f129h, 0f129h, 0f129h, 0f129h, 0f129h + WORD 0f129h, 0f129h, 0f129h, 0f129h, 0f129h, 0f129h, 0f129h, 0f129h + WORD 00b6h, 00b6h, 00b6h, 00b6h, 00b6h, 00b6h, 00b6h, 00b6h + WORD 00b6h, 00b6h, 00b6h, 00b6h, 00b6h, 00b6h, 00b6h, 00b6h + WORD 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h + WORD 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h, 0c2b6h + WORD 023dh, 023dh, 023dh, 023dh, 023dh, 023dh, 023dh, 023dh + WORD 07d4h, 07d4h, 07d4h, 07d4h, 07d4h, 07d4h, 07d4h, 07d4h + WORD 0e93dh, 0e93dh, 0e93dh, 0e93dh, 0e93dh, 0e93dh, 0e93dh, 0e93dh + WORD 43d4h, 
43d4h, 43d4h, 43d4h, 43d4h, 43d4h, 43d4h, 43d4h + WORD 0108h, 0108h, 0108h, 0108h, 0108h, 0108h, 0108h, 0108h + WORD 017fh, 017fh, 017fh, 017fh, 017fh, 017fh, 017fh, 017fh + WORD 9908h, 9908h, 9908h, 9908h, 9908h, 9908h, 9908h, 9908h + WORD 8e7fh, 8e7fh, 8e7fh, 8e7fh, 8e7fh, 8e7fh, 8e7fh, 8e7fh + WORD 04c7h, 04c7h, 04c7h, 04c7h, 028ch, 028ch, 028ch, 028ch + WORD 0ad9h, 0ad9h, 0ad9h, 0ad9h, 03f7h, 03f7h, 03f7h, 03f7h + WORD 0e9c7h, 0e9c7h, 0e9c7h, 0e9c7h, 0e68ch, 0e68ch, 0e68ch, 0e68ch + WORD 05d9h, 05d9h, 05d9h, 05d9h, 78f7h, 78f7h, 78f7h, 78f7h + WORD 07f4h, 07f4h, 07f4h, 07f4h, 05d3h, 05d3h, 05d3h, 05d3h + WORD 0be7h, 0be7h, 0be7h, 0be7h, 06f9h, 06f9h, 06f9h, 06f9h + WORD 0a3f4h, 0a3f4h, 0a3f4h, 0a3f4h, 4ed3h, 4ed3h, 4ed3h, 4ed3h + WORD 50e7h, 50e7h, 50e7h, 50e7h, 61f9h, 61f9h, 61f9h, 61f9h + WORD 09c4h, 09c4h, 09c4h, 09c4h, 09c4h, 09c4h, 09c4h, 09c4h + WORD 05b2h, 05b2h, 05b2h, 05b2h, 05b2h, 05b2h, 05b2h, 05b2h + WORD 15c4h, 15c4h, 15c4h, 15c4h, 15c4h, 15c4h, 15c4h, 15c4h + WORD 0fbb2h, 0fbb2h, 0fbb2h, 0fbb2h, 0fbb2h, 0fbb2h, 0fbb2h, 0fbb2h + WORD 06bfh, 06bfh, 06bfh, 06bfh, 06bfh, 06bfh, 06bfh, 06bfh + WORD 0c7fh, 0c7fh, 0c7fh, 0c7fh, 0c7fh, 0c7fh, 0c7fh, 0c7fh + WORD 53bfh, 53bfh, 53bfh, 53bfh, 53bfh, 53bfh, 53bfh, 53bfh + WORD 997fh, 997fh, 997fh, 997fh, 997fh, 997fh, 997fh, 997fh + WORD 0204h, 0204h, 0204h, 0204h, 0cf9h, 0cf9h, 0cf9h, 0cf9h + WORD 0bc1h, 0bc1h, 0bc1h, 0bc1h, 0a67h, 0a67h, 0a67h, 0a67h + WORD 0ce04h, 0ce04h, 0ce04h, 0ce04h, 67f9h, 67f9h, 67f9h, 67f9h + WORD 3ec1h, 3ec1h, 3ec1h, 3ec1h, 0cf67h, 0cf67h, 0cf67h, 0cf67h + WORD 06afh, 06afh, 06afh, 06afh, 0877h, 0877h, 0877h, 0877h + WORD 007eh, 007eh, 007eh, 007eh, 05bdh, 05bdh, 05bdh, 05bdh + WORD 23afh, 23afh, 23afh, 23afh, 0fd77h, 0fd77h, 0fd77h, 0fd77h + WORD 9a7eh, 9a7eh, 9a7eh, 9a7eh, 6cbdh, 6cbdh, 6cbdh, 6cbdh + WORD 08b2h, 08b2h, 01aeh, 01aeh, 022bh, 022bh, 034bh, 034bh + WORD 081eh, 081eh, 0367h, 0367h, 060eh, 060eh, 0069h, 0069h + WORD 0feb2h, 0feb2h, 2baeh, 2baeh, 0d32bh, 0d32bh, 344bh, 
344bh + WORD 821eh, 821eh, 0c867h, 0c867h, 500eh, 500eh, 0ab69h, 0ab69h + WORD 01a6h, 01a6h, 024bh, 024bh, 00b1h, 00b1h, 0c16h, 0c16h + WORD 0bdeh, 0bdeh, 0b35h, 0b35h, 0626h, 0626h, 0675h, 0675h + WORD 93a6h, 93a6h, 334bh, 334bh, 03b1h, 03b1h, 0ee16h, 0ee16h + WORD 0c5deh, 0c5deh, 5a35h, 5a35h, 1826h, 1826h, 1575h, 1575h + WORD 0c0bh, 0c0bh, 030ah, 030ah, 0487h, 0487h, 0c6eh, 0c6eh + WORD 09f8h, 09f8h, 05cbh, 05cbh, 0aa7h, 0aa7h, 045fh, 045fh + WORD 7d0bh, 7d0bh, 810ah, 810ah, 2987h, 2987h, 766eh, 766eh + WORD 71f8h, 71f8h, 0b6cbh, 0b6cbh, 8fa7h, 8fa7h, 315fh, 315fh + WORD 06cbh, 06cbh, 0284h, 0284h, 0999h, 0999h, 015dh, 015dh + WORD 01a2h, 01a2h, 0149h, 0149h, 0c65h, 0c65h, 0cb6h, 0cb6h + WORD 0b7cbh, 0b7cbh, 4e84h, 4e84h, 4499h, 4499h, 485dh, 485dh + WORD 0c7a2h, 0c7a2h, 4c49h, 4c49h, 0eb65h, 0eb65h, 0ceb6h, 0ceb6h + WORD 0714h, 0714h, 0714h, 0714h, 0714h, 0714h, 0714h, 0714h + WORD 0714h, 0714h, 0714h, 0714h, 0714h, 0714h, 0714h, 0714h + WORD 0314h, 0314h, 0314h, 0314h, 0314h, 0314h, 0314h, 0314h + WORD 0314h, 0314h, 0314h, 0314h, 0314h, 0314h, 0314h, 0314h + WORD 011fh, 011fh, 011fh, 011fh, 011fh, 011fh, 011fh, 011fh + WORD 011fh, 011fh, 011fh, 011fh, 011fh, 011fh, 011fh, 011fh + WORD 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh + WORD 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh, 6e1fh + WORD 00cah, 00cah, 00cah, 00cah, 00cah, 00cah, 00cah, 00cah + WORD 00cah, 00cah, 00cah, 00cah, 00cah, 00cah, 00cah, 00cah + WORD 0becah, 0becah, 0becah, 0becah, 0becah, 0becah, 0becah, 0becah + WORD 0becah, 0becah, 0becah, 0becah, 0becah, 0becah, 0becah, 0becah + WORD 03c2h, 03c2h, 03c2h, 03c2h, 03c2h, 03c2h, 03c2h, 03c2h + WORD 03c2h, 03c2h, 03c2h, 03c2h, 03c2h, 03c2h, 03c2h, 03c2h + WORD 29c2h, 29c2h, 29c2h, 29c2h, 29c2h, 29c2h, 29c2h, 29c2h + WORD 29c2h, 29c2h, 29c2h, 29c2h, 29c2h, 29c2h, 29c2h, 29c2h + WORD 084fh, 084fh, 084fh, 084fh, 084fh, 084fh, 084fh, 084fh + WORD 084fh, 084fh, 084fh, 084fh, 084fh, 084fh, 084fh, 084fh + WORD 054fh, 054fh, 054fh, 054fh, 
054fh, 054fh, 054fh, 054fh + WORD 054fh, 054fh, 054fh, 054fh, 054fh, 054fh, 054fh, 054fh + WORD 073fh, 073fh, 073fh, 073fh, 073fh, 073fh, 073fh, 073fh + WORD 073fh, 073fh, 073fh, 073fh, 073fh, 073fh, 073fh, 073fh + WORD 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh + WORD 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh, 0d43fh + WORD 05bch, 05bch, 05bch, 05bch, 05bch, 05bch, 05bch, 05bch + WORD 05bch, 05bch, 05bch, 05bch, 05bch, 05bch, 05bch, 05bch + WORD 79bch, 79bch, 79bch, 79bch, 79bch, 79bch, 79bch, 79bch + WORD 79bch, 79bch, 79bch, 79bch, 79bch, 79bch, 79bch, 79bch + WORD 0a58h, 0a58h, 0a58h, 0a58h, 0a58h, 0a58h, 0a58h, 0a58h + WORD 03f9h, 03f9h, 03f9h, 03f9h, 03f9h, 03f9h, 03f9h, 03f9h + WORD 9258h, 9258h, 9258h, 9258h, 9258h, 9258h, 9258h, 9258h + WORD 5ef9h, 5ef9h, 5ef9h, 5ef9h, 5ef9h, 5ef9h, 5ef9h, 5ef9h + WORD 02dch, 02dch, 02dch, 02dch, 02dch, 02dch, 02dch, 02dch + WORD 0260h, 0260h, 0260h, 0260h, 0260h, 0260h, 0260h, 0260h + WORD 0d6dch, 0d6dch, 0d6dch, 0d6dch, 0d6dch, 0d6dch, 0d6dch, 0d6dch + WORD 2260h, 2260h, 2260h, 2260h, 2260h, 2260h, 2260h, 2260h + WORD 09ach, 09ach, 09ach, 09ach, 0ca7h, 0ca7h, 0ca7h, 0ca7h + WORD 0bf2h, 0bf2h, 0bf2h, 0bf2h, 033eh, 033eh, 033eh, 033eh + WORD 4dach, 4dach, 4dach, 4dach, 91a7h, 91a7h, 91a7h, 91a7h + WORD 0c1f2h, 0c1f2h, 0c1f2h, 0c1f2h, 0dd3eh, 0dd3eh, 0dd3eh, 0dd3eh + WORD 006bh, 006bh, 006bh, 006bh, 0774h, 0774h, 0774h, 0774h + WORD 0c0ah, 0c0ah, 0c0ah, 0c0ah, 094ah, 094ah, 094ah, 094ah + WORD 916bh, 916bh, 916bh, 916bh, 2374h, 2374h, 2374h, 2374h + WORD 8a0ah, 8a0ah, 8a0ah, 8a0ah, 474ah, 474ah, 474ah, 474ah + WORD 06fbh, 06fbh, 06fbh, 06fbh, 06fbh, 06fbh, 06fbh, 06fbh + WORD 019bh, 019bh, 019bh, 019bh, 019bh, 019bh, 019bh, 019bh + WORD 47fbh, 47fbh, 47fbh, 47fbh, 47fbh, 47fbh, 47fbh, 47fbh + WORD 229bh, 229bh, 229bh, 229bh, 229bh, 229bh, 229bh, 229bh + WORD 0c34h, 0c34h, 0c34h, 0c34h, 0c34h, 0c34h, 0c34h, 0c34h + WORD 06deh, 06deh, 06deh, 06deh, 06deh, 06deh, 06deh, 06deh + WORD 6834h, 
6834h, 6834h, 6834h, 6834h, 6834h, 6834h, 6834h + WORD 0c0deh, 0c0deh, 0c0deh, 0c0deh, 0c0deh, 0c0deh, 0c0deh, 0c0deh + WORD 0b73h, 0b73h, 0b73h, 0b73h, 03c1h, 03c1h, 03c1h, 03c1h + WORD 071dh, 071dh, 071dh, 071dh, 0a2ch, 0a2ch, 0a2ch, 0a2ch + WORD 3473h, 3473h, 3473h, 3473h, 36c1h, 36c1h, 36c1h, 36c1h + WORD 8e1dh, 8e1dh, 8e1dh, 8e1dh, 0ce2ch, 0ce2ch, 0ce2ch, 0ce2ch + WORD 01c0h, 01c0h, 01c0h, 01c0h, 08d8h, 08d8h, 08d8h, 08d8h + WORD 02a5h, 02a5h, 02a5h, 02a5h, 0806h, 0806h, 0806h, 0806h + WORD 41c0h, 41c0h, 41c0h, 41c0h, 10d8h, 10d8h, 10d8h, 10d8h + WORD 0a1a5h, 0a1a5h, 0a1a5h, 0a1a5h, 0ba06h, 0ba06h, 0ba06h, 0ba06h + WORD 0331h, 0331h, 0449h, 0449h, 025bh, 025bh, 0262h, 0262h + WORD 052ah, 052ah, 07fch, 07fch, 0748h, 0748h, 0180h, 0180h + WORD 8631h, 8631h, 4f49h, 4f49h, 635bh, 635bh, 0862h, 0862h + WORD 0e32ah, 0e32ah, 3bfch, 3bfch, 5f48h, 5f48h, 8180h, 8180h + WORD 0842h, 0842h, 0c79h, 0c79h, 04c2h, 04c2h, 07cah, 07cah + WORD 0997h, 0997h, 00dch, 00dch, 085eh, 085eh, 0686h, 0686h + WORD 0ae42h, 0ae42h, 0e779h, 0e779h, 2ac2h, 2ac2h, 0c5cah, 0c5cah + WORD 5e97h, 5e97h, 0d4dch, 0d4dch, 425eh, 425eh, 3886h, 3886h + WORD 0860h, 0860h, 0707h, 0707h, 0803h, 0803h, 031ah, 031ah + WORD 071bh, 071bh, 09abh, 09abh, 099bh, 099bh, 01deh, 01deh + WORD 2860h, 2860h, 0ac07h, 0ac07h, 0e103h, 0e103h, 0b11ah, 0b11ah + WORD 0a81bh, 0a81bh, 5aabh, 5aabh, 2a9bh, 2a9bh, 0bbdeh, 0bbdeh + WORD 0c95h, 0c95h, 0bcdh, 0bcdh, 03e4h, 03e4h, 03dfh, 03dfh + WORD 03beh, 03beh, 074dh, 074dh, 05f2h, 05f2h, 065ch, 065ch + WORD 7b95h, 7b95h, 0a2cdh, 0a2cdh, 6fe4h, 6fe4h, 0b0dfh, 0b0dfh + WORD 5dbeh, 5dbeh, 1e4dh, 1e4dh, 0bbf2h, 0bbf2h, 5a5ch, 5a5ch +ptr_L_mlkem_avx2_zetas QWORD L_mlkem_avx2_zetas +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_avx2_zetas_basemul WORD 08b2h, 081eh, 0f74eh, 0f7e2h, 01aeh, 0367h, 0fe52h, 0fc99h + WORD 022bh, 060eh, 0fdd5h, 0f9f2h, 034bh, 0069h, 0fcb5h, 0ff97h + WORD 0feb2h, 821eh, 014eh, 7de2h, 2baeh, 0c867h, 0d452h, 3799h + WORD 0d32bh, 500eh, 2cd5h, 0aff2h, 344bh, 
0ab69h, 0cbb5h, 5497h + WORD 01a6h, 0bdeh, 0fe5ah, 0f422h, 024bh, 0b35h, 0fdb5h, 0f4cbh + WORD 00b1h, 0626h, 0ff4fh, 0f9dah, 0c16h, 0675h, 0f3eah, 0f98bh + WORD 93a6h, 0c5deh, 6c5ah, 3a22h, 334bh, 5a35h, 0ccb5h, 0a5cbh + WORD 03b1h, 1826h, 0fc4fh, 0e7dah, 0ee16h, 1575h, 11eah, 0ea8bh + WORD 0c0bh, 09f8h, 0f3f5h, 0f608h, 030ah, 05cbh, 0fcf6h, 0fa35h + WORD 0487h, 0aa7h, 0fb79h, 0f559h, 0c6eh, 045fh, 0f392h, 0fba1h + WORD 7d0bh, 71f8h, 82f5h, 8e08h, 810ah, 0b6cbh, 7ef6h, 4935h + WORD 2987h, 8fa7h, 0d679h, 7059h, 766eh, 315fh, 8992h, 0cea1h + WORD 06cbh, 01a2h, 0f935h, 0fe5eh, 0284h, 0149h, 0fd7ch, 0feb7h + WORD 0999h, 0c65h, 0f667h, 0f39bh, 015dh, 0cb6h, 0fea3h, 0f34ah + WORD 0b7cbh, 0c7a2h, 4835h, 385eh, 4e84h, 4c49h, 0b17ch, 0b3b7h + WORD 4499h, 0eb65h, 0bb67h, 149bh, 485dh, 0ceb6h, 0b7a3h, 314ah + WORD 0331h, 052ah, 0fccfh, 0fad6h, 0449h, 07fch, 0fbb7h, 0f804h + WORD 025bh, 0748h, 0fda5h, 0f8b8h, 0262h, 0180h, 0fd9eh, 0fe80h + WORD 8631h, 0e32ah, 79cfh, 1cd6h, 4f49h, 3bfch, 0b0b7h, 0c404h + WORD 635bh, 5f48h, 9ca5h, 0a0b8h, 0862h, 8180h, 0f79eh, 7e80h + WORD 0842h, 0997h, 0f7beh, 0f669h, 0c79h, 00dch, 0f387h, 0ff24h + WORD 04c2h, 085eh, 0fb3eh, 0f7a2h, 07cah, 0686h, 0f836h, 0f97ah + WORD 0ae42h, 5e97h, 51beh, 0a169h, 0e779h, 0d4dch, 1887h, 2b24h + WORD 2ac2h, 425eh, 0d53eh, 0bda2h, 0c5cah, 3886h, 3a36h, 0c77ah + WORD 0860h, 071bh, 0f7a0h, 0f8e5h, 0707h, 09abh, 0f8f9h, 0f655h + WORD 0803h, 099bh, 0f7fdh, 0f665h, 031ah, 01deh, 0fce6h, 0fe22h + WORD 2860h, 0a81bh, 0d7a0h, 57e5h, 0ac07h, 5aabh, 53f9h, 0a555h + WORD 0e103h, 2a9bh, 1efdh, 0d565h, 0b11ah, 0bbdeh, 4ee6h, 4422h + WORD 0c95h, 03beh, 0f36bh, 0fc42h, 0bcdh, 074dh, 0f433h, 0f8b3h + WORD 03e4h, 05f2h, 0fc1ch, 0fa0eh, 03dfh, 065ch, 0fc21h, 0f9a4h + WORD 7b95h, 5dbeh, 846bh, 0a242h, 0a2cdh, 1e4dh, 5d33h, 0e1b3h + WORD 6fe4h, 0bbf2h, 901ch, 440eh, 0b0dfh, 5a5ch, 4f21h, 0a5a4h +ptr_L_mlkem_avx2_zetas_basemul QWORD L_mlkem_avx2_zetas_basemul +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_avx2_zetas_inv WORD 06a5h, 
06a5h, 05b4h, 05b4h, 070fh, 070fh, 0943h, 0943h + WORD 0922h, 0922h, 0134h, 0134h, 091dh, 091dh, 006ch, 006ch + WORD 0a5a5h, 0a5a5h, 0e1b4h, 0e1b4h, 440fh, 440fh, 0a243h, 0a243h + WORD 4f22h, 4f22h, 5d34h, 5d34h, 901dh, 901dh, 846ch, 846ch + WORD 0b23h, 0b23h, 0356h, 0356h, 0366h, 0366h, 05e6h, 05e6h + WORD 09e7h, 09e7h, 05fah, 05fah, 04feh, 04feh, 04a1h, 04a1h + WORD 4423h, 4423h, 0a556h, 0a556h, 0d566h, 0d566h, 57e6h, 57e6h + WORD 4ee7h, 4ee7h, 53fah, 53fah, 1efeh, 1efeh, 0d7a1h, 0d7a1h + WORD 04fbh, 04fbh, 04fbh, 04fbh, 0a5ch, 0a5ch, 0a5ch, 0a5ch + WORD 0429h, 0429h, 0429h, 0429h, 0b41h, 0b41h, 0b41h, 0b41h + WORD 45fbh, 45fbh, 45fbh, 45fbh, 5e5ch, 5e5ch, 5e5ch, 5e5ch + WORD 0ef29h, 0ef29h, 0ef29h, 0ef29h, 0be41h, 0be41h, 0be41h, 0be41h + WORD 02d5h, 02d5h, 02d5h, 02d5h, 05e4h, 05e4h, 05e4h, 05e4h + WORD 0940h, 0940h, 0940h, 0940h, 018eh, 018eh, 018eh, 018eh + WORD 31d5h, 31d5h, 31d5h, 31d5h, 71e4h, 71e4h, 71e4h, 71e4h + WORD 0c940h, 0c940h, 0c940h, 0c940h, 0cb8eh, 0cb8eh, 0cb8eh, 0cb8eh + WORD 0623h, 0623h, 0623h, 0623h, 0623h, 0623h, 0623h, 0623h + WORD 00cdh, 00cdh, 00cdh, 00cdh, 00cdh, 00cdh, 00cdh, 00cdh + WORD 3f23h, 3f23h, 3f23h, 3f23h, 3f23h, 3f23h, 3f23h, 3f23h + WORD 97cdh, 97cdh, 97cdh, 97cdh, 97cdh, 97cdh, 97cdh, 97cdh + WORD 0b66h, 0b66h, 0b66h, 0b66h, 0b66h, 0b66h, 0b66h, 0b66h + WORD 0606h, 0606h, 0606h, 0606h, 0606h, 0606h, 0606h, 0606h + WORD 0dd66h, 0dd66h, 0dd66h, 0dd66h, 0dd66h, 0dd66h, 0dd66h, 0dd66h + WORD 0b806h, 0b806h, 0b806h, 0b806h, 0b806h, 0b806h, 0b806h, 0b806h + WORD 0745h, 0745h, 0745h, 0745h, 0745h, 0745h, 0745h, 0745h + WORD 0745h, 0745h, 0745h, 0745h, 0745h, 0745h, 0745h, 0745h + WORD 8645h, 8645h, 8645h, 8645h, 8645h, 8645h, 8645h, 8645h + WORD 8645h, 8645h, 8645h, 8645h, 8645h, 8645h, 8645h, 8645h + WORD 05c2h, 05c2h, 05c2h, 05c2h, 05c2h, 05c2h, 05c2h, 05c2h + WORD 05c2h, 05c2h, 05c2h, 05c2h, 05c2h, 05c2h, 05c2h, 05c2h + WORD 2bc2h, 2bc2h, 2bc2h, 2bc2h, 2bc2h, 2bc2h, 2bc2h, 2bc2h + WORD 2bc2h, 2bc2h, 2bc2h, 2bc2h, 2bc2h, 
2bc2h, 2bc2h, 2bc2h + WORD 0c37h, 0c37h, 0c37h, 0c37h, 0c37h, 0c37h, 0c37h, 0c37h + WORD 0c37h, 0c37h, 0c37h, 0c37h, 0c37h, 0c37h, 0c37h, 0c37h + WORD 4137h, 4137h, 4137h, 4137h, 4137h, 4137h, 4137h, 4137h + WORD 4137h, 4137h, 4137h, 4137h, 4137h, 4137h, 4137h, 4137h + WORD 067bh, 067bh, 0c25h, 0c25h, 04a3h, 04a3h, 036ah, 036ah + WORD 0537h, 0537h, 0088h, 0088h, 083fh, 083fh, 04bfh, 04bfh + WORD 0c77bh, 0c77bh, 2b25h, 2b25h, 0bda3h, 0bda3h, 0a16ah, 0a16ah + WORD 3a37h, 3a37h, 1888h, 1888h, 0d53fh, 0d53fh, 51bfh, 51bfh + WORD 0b81h, 0b81h, 0505h, 0505h, 05b9h, 05b9h, 07d7h, 07d7h + WORD 0a9fh, 0a9fh, 08b8h, 08b8h, 0aa6h, 0aa6h, 09d0h, 09d0h + WORD 7e81h, 7e81h, 0c405h, 0c405h, 0a0b9h, 0a0b9h, 1cd7h, 1cd7h + WORD 0f79fh, 0f79fh, 0b0b8h, 0b0b8h, 9ca6h, 9ca6h, 79d0h, 79d0h + WORD 03b7h, 03b7h, 03b7h, 03b7h, 00f7h, 00f7h, 00f7h, 00f7h + WORD 058dh, 058dh, 058dh, 058dh, 0c96h, 0c96h, 0c96h, 0c96h + WORD 0b8b7h, 0b8b7h, 0b8b7h, 0b8b7h, 75f7h, 75f7h, 75f7h, 75f7h + WORD 0dc8dh, 0dc8dh, 0dc8dh, 0dc8dh, 6e96h, 6e96h, 6e96h, 6e96h + WORD 09c3h, 09c3h, 09c3h, 09c3h, 010fh, 010fh, 010fh, 010fh + WORD 005ah, 005ah, 005ah, 005ah, 0355h, 0355h, 0355h, 0355h + WORD 22c3h, 22c3h, 22c3h, 22c3h, 3e0fh, 3e0fh, 3e0fh, 3e0fh + WORD 6e5ah, 6e5ah, 6e5ah, 6e5ah, 0b255h, 0b255h, 0b255h, 0b255h + WORD 0aa1h, 0aa1h, 0aa1h, 0aa1h, 0aa1h, 0aa1h, 0aa1h, 0aa1h + WORD 0a25h, 0a25h, 0a25h, 0a25h, 0a25h, 0a25h, 0a25h, 0a25h + WORD 0dda1h, 0dda1h, 0dda1h, 0dda1h, 0dda1h, 0dda1h, 0dda1h, 0dda1h + WORD 2925h, 2925h, 2925h, 2925h, 2925h, 2925h, 2925h, 2925h + WORD 0908h, 0908h, 0908h, 0908h, 0908h, 0908h, 0908h, 0908h + WORD 02a9h, 02a9h, 02a9h, 02a9h, 02a9h, 02a9h, 02a9h, 02a9h + WORD 0a108h, 0a108h, 0a108h, 0a108h, 0a108h, 0a108h, 0a108h, 0a108h + WORD 6da9h, 6da9h, 6da9h, 6da9h, 6da9h, 6da9h, 6da9h, 6da9h + WORD 04b2h, 04b2h, 04b2h, 04b2h, 04b2h, 04b2h, 04b2h, 04b2h + WORD 04b2h, 04b2h, 04b2h, 04b2h, 04b2h, 04b2h, 04b2h, 04b2h + WORD 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h + 
WORD 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h, 0fab2h + WORD 093fh, 093fh, 093fh, 093fh, 093fh, 093fh, 093fh, 093fh + WORD 093fh, 093fh, 093fh, 093fh, 093fh, 093fh, 093fh, 093fh + WORD 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh + WORD 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh, 0d63fh + WORD 0be2h, 0be2h, 0be2h, 0be2h, 0be2h, 0be2h, 0be2h, 0be2h + WORD 0be2h, 0be2h, 0be2h, 0be2h, 0be2h, 0be2h, 0be2h, 0be2h + WORD 91e2h, 91e2h, 91e2h, 91e2h, 91e2h, 91e2h, 91e2h, 91e2h + WORD 91e2h, 91e2h, 91e2h, 91e2h, 91e2h, 91e2h, 91e2h, 91e2h + WORD 05edh, 05edh, 05edh, 05edh, 05edh, 05edh, 05edh, 05edh + WORD 05edh, 05edh, 05edh, 05edh, 05edh, 05edh, 05edh, 05edh + WORD 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh + WORD 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh, 0fcedh + WORD 004bh, 004bh, 0bb8h, 0bb8h, 009ch, 009ch, 0b5fh, 0b5fh + WORD 0ba4h, 0ba4h, 0a7dh, 0a7dh, 0368h, 0368h, 0636h, 0636h + WORD 314bh, 314bh, 0b3b8h, 0b3b8h, 149ch, 149ch, 385fh, 385fh + WORD 0b7a4h, 0b7a4h, 0b17dh, 0b17dh, 0bb68h, 0bb68h, 4836h, 4836h + WORD 08a2h, 08a2h, 0736h, 0736h, 025ah, 025ah, 0309h, 0309h + WORD 0093h, 0093h, 09f7h, 09f7h, 087ah, 087ah, 00f6h, 00f6h + WORD 0cea2h, 0cea2h, 4936h, 4936h, 705ah, 705ah, 8e09h, 8e09h + WORD 8993h, 8993h, 7ef7h, 7ef7h, 0d67ah, 0d67ah, 82f6h, 82f6h + WORD 0744h, 0744h, 0744h, 0744h, 0c83h, 0c83h, 0c83h, 0c83h + WORD 048ah, 048ah, 048ah, 048ah, 0652h, 0652h, 0652h, 0652h + WORD 9344h, 9344h, 9344h, 9344h, 6583h, 6583h, 6583h, 6583h + WORD 028ah, 028ah, 028ah, 028ah, 0dc52h, 0dc52h, 0dc52h, 0dc52h + WORD 029ah, 029ah, 029ah, 029ah, 0140h, 0140h, 0140h, 0140h + WORD 0008h, 0008h, 0008h, 0008h, 0afdh, 0afdh, 0afdh, 0afdh + WORD 309ah, 309ah, 309ah, 309ah, 0c140h, 0c140h, 0c140h, 0c140h + WORD 9808h, 9808h, 9808h, 9808h, 31fdh, 31fdh, 31fdh, 31fdh + WORD 0082h, 0082h, 0082h, 0082h, 0082h, 0082h, 0082h, 0082h + WORD 0642h, 0642h, 0642h, 0642h, 0642h, 0642h, 0642h, 0642h + WORD 6682h, 
6682h, 6682h, 6682h, 6682h, 6682h, 6682h, 6682h + WORD 0ac42h, 0ac42h, 0ac42h, 0ac42h, 0ac42h, 0ac42h, 0ac42h, 0ac42h + WORD 074fh, 074fh, 074fh, 074fh, 074fh, 074fh, 074fh, 074fh + WORD 033dh, 033dh, 033dh, 033dh, 033dh, 033dh, 033dh, 033dh + WORD 044fh, 044fh, 044fh, 044fh, 044fh, 044fh, 044fh, 044fh + WORD 0ea3dh, 0ea3dh, 0ea3dh, 0ea3dh, 0ea3dh, 0ea3dh, 0ea3dh, 0ea3dh + WORD 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh + WORD 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh, 0c4bh + WORD 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh + WORD 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh, 3d4bh + WORD 06d8h, 06d8h, 06d8h, 06d8h, 06d8h, 06d8h, 06d8h, 06d8h + WORD 06d8h, 06d8h, 06d8h, 06d8h, 06d8h, 06d8h, 06d8h, 06d8h + WORD 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h + WORD 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h, 0ed8h + WORD 0773h, 0773h, 0773h, 0773h, 0773h, 0773h, 0773h, 0773h + WORD 0773h, 0773h, 0773h, 0773h, 0773h, 0773h, 0773h, 0773h + WORD 3073h, 3073h, 3073h, 3073h, 3073h, 3073h, 3073h, 3073h + WORD 3073h, 3073h, 3073h, 3073h, 3073h, 3073h, 3073h, 3073h + WORD 068ch, 068ch, 01cch, 01cch, 06dbh, 06dbh, 0123h, 0123h + WORD 00ebh, 00ebh, 0ab6h, 0ab6h, 0c50h, 0c50h, 0b5bh, 0b5bh + WORD 0ea8ch, 0ea8ch, 0a5cch, 0a5cch, 0e7dbh, 0e7dbh, 3a23h, 3a23h + WORD 11ebh, 11ebh, 0ccb6h, 0ccb6h, 0fc50h, 0fc50h, 6c5bh, 6c5bh + WORD 0c98h, 0c98h, 099ah, 099ah, 06f3h, 06f3h, 04e3h, 04e3h + WORD 09b6h, 09b6h, 0b53h, 0b53h, 0ad6h, 0ad6h, 044fh, 044fh + WORD 5498h, 5498h, 379ah, 379ah, 0aff3h, 0aff3h, 7de3h, 7de3h + WORD 0cbb6h, 0cbb6h, 0d453h, 0d453h, 2cd6h, 2cd6h, 014fh, 014fh + WORD 0608h, 0608h, 0608h, 0608h, 011ah, 011ah, 011ah, 011ah + WORD 072eh, 072eh, 072eh, 072eh, 050dh, 050dh, 050dh, 050dh + WORD 9e08h, 9e08h, 9e08h, 9e08h, 0af1ah, 0af1ah, 0af1ah, 0af1ah + WORD 0b12eh, 0b12eh, 0b12eh, 0b12eh, 5c0dh, 5c0dh, 5c0dh, 5c0dh + WORD 090ah, 090ah, 090ah, 090ah, 0228h, 0228h, 0228h, 0228h + WORD 0a75h, 0a75h, 0a75h, 0a75h, 083ah, 083ah, 
083ah, 083ah + WORD 870ah, 870ah, 870ah, 870ah, 0fa28h, 0fa28h, 0fa28h, 0fa28h + WORD 1975h, 1975h, 1975h, 1975h, 163ah, 163ah, 163ah, 163ah + WORD 0b82h, 0b82h, 0b82h, 0b82h, 0b82h, 0b82h, 0b82h, 0b82h + WORD 0bf9h, 0bf9h, 0bf9h, 0bf9h, 0bf9h, 0bf9h, 0bf9h, 0bf9h + WORD 7182h, 7182h, 7182h, 7182h, 7182h, 7182h, 7182h, 7182h + WORD 66f9h, 66f9h, 66f9h, 66f9h, 66f9h, 66f9h, 66f9h, 66f9h + WORD 052dh, 052dh, 052dh, 052dh, 052dh, 052dh, 052dh, 052dh + WORD 0ac4h, 0ac4h, 0ac4h, 0ac4h, 0ac4h, 0ac4h, 0ac4h, 0ac4h + WORD 0bc2dh, 0bc2dh, 0bc2dh, 0bc2dh, 0bc2dh, 0bc2dh, 0bc2dh, 0bc2dh + WORD 16c4h, 16c4h, 16c4h, 16c4h, 16c4h, 16c4h, 16c4h, 16c4h + WORD 0a93h, 0a93h, 0a93h, 0a93h, 0a93h, 0a93h, 0a93h, 0a93h + WORD 0a93h, 0a93h, 0a93h, 0a93h, 0a93h, 0a93h, 0a93h, 0a93h + WORD 9393h, 9393h, 9393h, 9393h, 9393h, 9393h, 9393h, 9393h + WORD 9393h, 9393h, 9393h, 9393h, 9393h, 9393h, 9393h, 9393h + WORD 00abh, 00abh, 00abh, 00abh, 00abh, 00abh, 00abh, 00abh + WORD 00abh, 00abh, 00abh, 00abh, 00abh, 00abh, 00abh, 00abh + WORD 51abh, 51abh, 51abh, 51abh, 51abh, 51abh, 51abh, 51abh + WORD 51abh, 51abh, 51abh, 51abh, 51abh, 51abh, 51abh, 51abh + WORD 072ch, 072ch, 072ch, 072ch, 072ch, 072ch, 072ch, 072ch + WORD 072ch, 072ch, 072ch, 072ch, 072ch, 072ch, 072ch, 072ch + WORD 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch + WORD 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch, 0cb2ch + WORD 0167h, 0167h, 0167h, 0167h, 0167h, 0167h, 0167h, 0167h + WORD 0167h, 0167h, 0167h, 0167h, 0167h, 0167h, 0167h, 0167h + WORD 0c667h, 0c667h, 0c667h, 0c667h, 0c667h, 0c667h, 0c667h, 0c667h + WORD 0c667h, 0c667h, 0c667h, 0c667h, 0c667h, 0c667h, 0c667h, 0c667h + WORD 02f6h, 02f6h, 02f6h, 02f6h, 02f6h, 02f6h, 02f6h, 02f6h + WORD 02f6h, 02f6h, 02f6h, 02f6h, 02f6h, 02f6h, 02f6h, 02f6h + WORD 84f6h, 84f6h, 84f6h, 84f6h, 84f6h, 84f6h, 84f6h, 84f6h + WORD 84f6h, 84f6h, 84f6h, 84f6h, 84f6h, 84f6h, 84f6h, 84f6h + WORD 05a1h, 05a1h, 05a1h, 05a1h, 05a1h, 05a1h, 05a1h, 05a1h + WORD 05a1h, 05a1h, 
05a1h, 05a1h, 05a1h, 05a1h, 05a1h, 05a1h + WORD 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h + WORD 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h, 0d8a1h +ptr_L_mlkem_avx2_zetas_inv QWORD L_mlkem_avx2_zetas_inv +_DATA ENDS +_TEXT SEGMENT READONLY PARA +mlkem_keygen_avx2 PROC + push r12 + push r13 + push r14 + mov rax, QWORD PTR [rsp+64] + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqu ymm14, YMMWORD PTR mlkem_q + vmovdqu ymm15, YMMWORD PTR mlkem_v + mov r13, rcx + movsxd r11, eax + mov r12, rcx +L_mlkem_keygen_avx2_priv: + ; ntt + mov r14, QWORD PTR [ptr_L_mlkem_avx2_zetas] + vmovdqu ymm10, YMMWORD PTR [r14] + vmovdqu ymm12, YMMWORD PTR [r14+32] + vmovdqu ymm0, YMMWORD PTR [r12+128] + vmovdqu ymm1, YMMWORD PTR [r12+160] + vmovdqu ymm2, YMMWORD PTR [r12+192] + vmovdqu ymm3, YMMWORD PTR [r12+224] + vmovdqu ymm4, YMMWORD PTR [r12+384] + vmovdqu ymm5, YMMWORD PTR [r12+416] + vmovdqu ymm6, YMMWORD PTR [r12+448] + vmovdqu ymm7, YMMWORD PTR [r12+480] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + vmovdqu YMMWORD PTR [r12+128], 
ymm0 + vmovdqu YMMWORD PTR [r12+160], ymm1 + vmovdqu YMMWORD PTR [r12+192], ymm2 + vmovdqu YMMWORD PTR [r12+224], ymm3 + vmovdqu YMMWORD PTR [r12+384], ymm4 + vmovdqu YMMWORD PTR [r12+416], ymm5 + vmovdqu YMMWORD PTR [r12+448], ymm6 + vmovdqu YMMWORD PTR [r12+480], ymm7 + vmovdqu ymm0, YMMWORD PTR [r12] + vmovdqu ymm1, YMMWORD PTR [r12+32] + vmovdqu ymm2, YMMWORD PTR [r12+64] + vmovdqu ymm3, YMMWORD PTR [r12+96] + vmovdqu ymm4, YMMWORD PTR [r12+256] + vmovdqu ymm5, YMMWORD PTR [r12+288] + vmovdqu ymm6, YMMWORD PTR [r12+320] + vmovdqu ymm7, YMMWORD PTR [r12+352] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + vmovdqu YMMWORD PTR [r12+256], ymm4 + vmovdqu YMMWORD PTR [r12+288], ymm5 + vmovdqu YMMWORD PTR [r12+320], ymm6 + vmovdqu YMMWORD PTR [r12+352], ymm7 + vmovdqu ymm4, YMMWORD PTR [r12+128] + vmovdqu ymm5, YMMWORD PTR [r12+160] + vmovdqu ymm6, YMMWORD PTR [r12+192] + vmovdqu ymm7, YMMWORD PTR [r12+224] + ; 64: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+64] + vmovdqu ymm12, YMMWORD PTR [r14+96] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw 
ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + ; 32: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+128] + vmovdqu ymm12, YMMWORD PTR [r14+160] + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm2, ymm0, ymm8 + vpsubw ymm3, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + ; 32: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+192] + vmovdqu ymm12, YMMWORD PTR [r14+224] + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm4, ymm8 + vpsubw ymm7, ymm5, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm5, ymm5, ymm9 + ; 16: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+256] + vmovdqu ymm12, YMMWORD PTR [r14+288] + vmovdqu ymm11, YMMWORD PTR [r14+320] + vmovdqu ymm13, YMMWORD PTR [r14+352] + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 16: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+384] + vmovdqu ymm12, YMMWORD PTR [r14+416] + vmovdqu ymm11, YMMWORD PTR [r14+448] + vmovdqu ymm13, YMMWORD PTR [r14+480] + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + 
vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 8: 0/3 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [r14+512] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [r14+544] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [r14+576] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [r14+608] + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm0, ymm1, ymm0 + vpsubw ymm2, ymm3, ymm2 + vpsubw ymm1, ymm8, ymm0 + vpsubw ymm3, ymm9, ymm2 + vpaddw ymm8, ymm8, ymm0 + vpaddw ymm9, ymm9, ymm2 + ; 4: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+640] + vmovdqu ymm12, YMMWORD PTR [r14+672] + vmovdqu ymm11, YMMWORD PTR [r14+704] + vmovdqu ymm13, YMMWORD PTR [r14+736] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 8: 0/3 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [r14+768] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [r14+800] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [r14+832] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [r14+864] + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm4, ymm5, ymm4 + vpsubw ymm6, ymm7, ymm6 + vpsubw ymm5, ymm8, ymm4 + vpsubw ymm7, ymm9, ymm6 + vpaddw ymm8, ymm8, ymm4 + vpaddw ymm9, ymm9, ymm6 + ; 4: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+896] + 
vmovdqu ymm12, YMMWORD PTR [r14+928] + vmovdqu ymm11, YMMWORD PTR [r14+960] + vmovdqu ymm13, YMMWORD PTR [r14+992] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 2: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+1024] + vmovdqu ymm12, YMMWORD PTR [r14+1056] + vmovdqu ymm11, YMMWORD PTR [r14+1088] + vmovdqu ymm13, YMMWORD PTR [r14+1120] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 2: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+1152] + vmovdqu ymm12, YMMWORD PTR [r14+1184] + vmovdqu ymm11, YMMWORD PTR [r14+1216] + vmovdqu ymm13, YMMWORD PTR [r14+1248] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, 
ymm9 + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vperm2i128 ymm2, ymm8, ymm9, 32 + vperm2i128 ymm3, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vperm2i128 ymm4, ymm8, ymm9, 32 + vperm2i128 ymm5, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vperm2i128 ymm6, ymm8, ymm9, 32 + vperm2i128 ymm7, ymm8, ymm9, 49 + vpmulhw ymm8, ymm0, ymm15 + vpmulhw ymm9, ymm1, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm0, ymm8 + vpsubw ymm9, ymm1, ymm9 + vmovdqu YMMWORD PTR [r12], ymm8 + vmovdqu YMMWORD PTR [r12+32], ymm9 + vpmulhw ymm8, ymm2, ymm15 + vpmulhw ymm9, ymm3, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vmovdqu YMMWORD PTR [r12+64], ymm8 + vmovdqu YMMWORD PTR [r12+96], ymm9 + vpmulhw ymm8, ymm4, ymm15 + vpmulhw ymm9, ymm5, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vmovdqu YMMWORD PTR [r12+128], ymm8 + vmovdqu YMMWORD PTR [r12+160], ymm9 + vpmulhw ymm8, ymm6, ymm15 + vpmulhw ymm9, ymm7, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vmovdqu YMMWORD PTR [r12+192], ymm8 + vmovdqu YMMWORD PTR [r12+224], ymm9 + vmovdqu ymm0, YMMWORD PTR [r12+256] + vmovdqu ymm1, YMMWORD PTR [r12+288] + vmovdqu ymm2, YMMWORD PTR [r12+320] + vmovdqu ymm3, YMMWORD PTR [r12+352] + vmovdqu ymm4, YMMWORD PTR [r12+384] + vmovdqu ymm5, YMMWORD PTR [r12+416] + vmovdqu ymm6, YMMWORD PTR [r12+448] + vmovdqu ymm7, YMMWORD PTR [r12+480] + ; 64: 1/3 + vmovdqu ymm10, 
YMMWORD PTR [r14+1280] + vmovdqu ymm12, YMMWORD PTR [r14+1312] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + ; 32: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+1344] + vmovdqu ymm12, YMMWORD PTR [r14+1376] + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm2, ymm0, ymm8 + vpsubw ymm3, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + ; 32: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+1408] + vmovdqu ymm12, YMMWORD PTR [r14+1440] + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm4, ymm8 + vpsubw ymm7, ymm5, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm5, ymm5, ymm9 + ; 16: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+1472] + vmovdqu ymm12, YMMWORD PTR [r14+1504] + vmovdqu ymm11, YMMWORD PTR [r14+1536] + vmovdqu ymm13, YMMWORD PTR [r14+1568] + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw 
ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 16: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+1600] + vmovdqu ymm12, YMMWORD PTR [r14+1632] + vmovdqu ymm11, YMMWORD PTR [r14+1664] + vmovdqu ymm13, YMMWORD PTR [r14+1696] + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 8: 1/3 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [r14+1728] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [r14+1760] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [r14+1792] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [r14+1824] + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm0, ymm1, ymm0 + vpsubw ymm2, ymm3, ymm2 + vpsubw ymm1, ymm8, ymm0 + vpsubw ymm3, ymm9, ymm2 + vpaddw ymm8, ymm8, ymm0 + vpaddw ymm9, ymm9, ymm2 + ; 4: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+1856] + vmovdqu ymm12, YMMWORD PTR [r14+1888] + vmovdqu ymm11, YMMWORD PTR [r14+1920] + vmovdqu ymm13, YMMWORD PTR [r14+1952] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 8: 1/3 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [r14+1984] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [r14+2016] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, 
YMMWORD PTR [r14+2048] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [r14+2080] + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm4, ymm5, ymm4 + vpsubw ymm6, ymm7, ymm6 + vpsubw ymm5, ymm8, ymm4 + vpsubw ymm7, ymm9, ymm6 + vpaddw ymm8, ymm8, ymm4 + vpaddw ymm9, ymm9, ymm6 + ; 4: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+2112] + vmovdqu ymm12, YMMWORD PTR [r14+2144] + vmovdqu ymm11, YMMWORD PTR [r14+2176] + vmovdqu ymm13, YMMWORD PTR [r14+2208] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 2: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+2240] + vmovdqu ymm12, YMMWORD PTR [r14+2272] + vmovdqu ymm11, YMMWORD PTR [r14+2304] + vmovdqu ymm13, YMMWORD PTR [r14+2336] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 2: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+2368] + vmovdqu ymm12, YMMWORD PTR [r14+2400] + vmovdqu ymm11, YMMWORD PTR [r14+2432] + vmovdqu ymm13, YMMWORD PTR [r14+2464] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, 
ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vperm2i128 ymm2, ymm8, ymm9, 32 + vperm2i128 ymm3, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vperm2i128 ymm4, ymm8, ymm9, 32 + vperm2i128 ymm5, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vperm2i128 ymm6, ymm8, ymm9, 32 + vperm2i128 ymm7, ymm8, ymm9, 49 + vpmulhw ymm8, ymm0, ymm15 + vpmulhw ymm9, ymm1, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm0, ymm8 + vpsubw ymm9, ymm1, ymm9 + vmovdqu YMMWORD PTR [r12+256], ymm8 + vmovdqu YMMWORD PTR [r12+288], ymm9 + vpmulhw ymm8, ymm2, ymm15 + vpmulhw ymm9, ymm3, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vmovdqu YMMWORD PTR [r12+320], ymm8 + vmovdqu YMMWORD PTR [r12+352], ymm9 + vpmulhw ymm8, ymm4, ymm15 + vpmulhw ymm9, ymm5, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vmovdqu YMMWORD PTR [r12+384], ymm8 + vmovdqu YMMWORD PTR [r12+416], ymm9 + vpmulhw ymm8, ymm6, ymm15 + vpmulhw ymm9, ymm7, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + 
vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vmovdqu YMMWORD PTR [r12+448], ymm8 + vmovdqu YMMWORD PTR [r12+480], ymm9 + add r12, 512 + sub r11, 1 + jg L_mlkem_keygen_avx2_priv + vmovdqu ymm13, YMMWORD PTR mlkem_qinv + movsxd r10, eax + mov r12, rdx +L_mlkem_keygen_avx2_acc: + ; Pointwise acc mont + movsxd r11, eax + ; Base mul mont + mov r14, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul] + vmovdqu ymm2, YMMWORD PTR [r9] + vmovdqu ymm3, YMMWORD PTR [r9+32] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx] + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14] + vmovdqu ymm11, YMMWORD PTR [r14+32] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r12], ymm0 + vmovdqu YMMWORD PTR [r12+32], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+64] + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, 
ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+64] + vmovdqu ymm11, YMMWORD PTR [r14+96] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r12+64], ymm0 + vmovdqu YMMWORD PTR [r12+96], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+128] + vmovdqu ymm3, YMMWORD PTR [r9+160] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+128] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+128] + vmovdqu ymm11, YMMWORD PTR [r14+160] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + 
vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r12+128], ymm0 + vmovdqu YMMWORD PTR [r12+160], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+192] + vmovdqu ymm3, YMMWORD PTR [r9+224] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+192] + vmovdqu ymm5, YMMWORD PTR [rcx+224] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+192] + vmovdqu ymm11, YMMWORD PTR [r14+224] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r12+192], ymm0 + vmovdqu YMMWORD PTR [r12+224], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+256] + vmovdqu ymm3, YMMWORD PTR [r9+288] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+256] + vmovdqu ymm5, YMMWORD PTR [rcx+288] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+256] + vmovdqu ymm11, YMMWORD PTR [r14+288] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, 
ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r12+256], ymm0 + vmovdqu YMMWORD PTR [r12+288], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+320] + vmovdqu ymm3, YMMWORD PTR [r9+352] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+320] + vmovdqu ymm5, YMMWORD PTR [rcx+352] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+320] + vmovdqu ymm11, YMMWORD PTR [r14+352] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r12+320], ymm0 + vmovdqu YMMWORD PTR 
[r12+352], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+384] + vmovdqu ymm3, YMMWORD PTR [r9+416] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+384] + vmovdqu ymm5, YMMWORD PTR [rcx+416] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+384] + vmovdqu ymm11, YMMWORD PTR [r14+416] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r12+384], ymm0 + vmovdqu YMMWORD PTR [r12+416], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+448] + vmovdqu ymm3, YMMWORD PTR [r9+480] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+448] + vmovdqu ymm5, YMMWORD PTR [rcx+480] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+448] + vmovdqu ymm11, YMMWORD PTR [r14+480] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 
+ vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r12+448], ymm0 + vmovdqu YMMWORD PTR [r12+480], ymm1 + add r9, 512 + add rcx, 512 + sub r11, 2 + jz L_pointwise_acc_mont_end_keygen +L_pointwise_acc_mont_start_keygen: + ; Base mul mont add + mov r14, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul] + vmovdqu ymm2, YMMWORD PTR [r9] + vmovdqu ymm3, YMMWORD PTR [r9+32] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx] + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14] + vmovdqu ymm11, YMMWORD PTR [r14+32] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12] + vmovdqu ymm7, YMMWORD PTR [r12+32] + vpaddw 
ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r12], ymm0 + vmovdqu YMMWORD PTR [r12+32], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+64] + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+64] + vmovdqu ymm11, YMMWORD PTR [r14+96] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+64] + vmovdqu ymm7, YMMWORD PTR [r12+96] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r12+64], ymm0 + vmovdqu YMMWORD PTR [r12+96], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+128] + vmovdqu ymm3, YMMWORD PTR [r9+160] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+128] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+128] + vmovdqu ymm11, YMMWORD PTR [r14+160] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, 
ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+128] + vmovdqu ymm7, YMMWORD PTR [r12+160] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r12+128], ymm0 + vmovdqu YMMWORD PTR [r12+160], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+192] + vmovdqu ymm3, YMMWORD PTR [r9+224] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+192] + vmovdqu ymm5, YMMWORD PTR [rcx+224] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+192] + vmovdqu ymm11, YMMWORD PTR [r14+224] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw 
ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+192] + vmovdqu ymm7, YMMWORD PTR [r12+224] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r12+192], ymm0 + vmovdqu YMMWORD PTR [r12+224], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+256] + vmovdqu ymm3, YMMWORD PTR [r9+288] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+256] + vmovdqu ymm5, YMMWORD PTR [rcx+288] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+256] + vmovdqu ymm11, YMMWORD PTR [r14+288] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+256] + vmovdqu ymm7, YMMWORD PTR [r12+288] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r12+256], ymm0 + vmovdqu YMMWORD PTR [r12+288], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+320] + vmovdqu ymm3, YMMWORD PTR [r9+352] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+320] + vmovdqu ymm5, YMMWORD PTR [rcx+352] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, 
ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+320] + vmovdqu ymm11, YMMWORD PTR [r14+352] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+320] + vmovdqu ymm7, YMMWORD PTR [r12+352] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r12+320], ymm0 + vmovdqu YMMWORD PTR [r12+352], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+384] + vmovdqu ymm3, YMMWORD PTR [r9+416] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+384] + vmovdqu ymm5, YMMWORD PTR [rcx+416] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+384] + vmovdqu ymm11, YMMWORD PTR [r14+416] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 
+ vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+384] + vmovdqu ymm7, YMMWORD PTR [r12+416] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r12+384], ymm0 + vmovdqu YMMWORD PTR [r12+416], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+448] + vmovdqu ymm3, YMMWORD PTR [r9+480] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+448] + vmovdqu ymm5, YMMWORD PTR [rcx+480] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+448] + vmovdqu ymm11, YMMWORD PTR [r14+480] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+448] + vmovdqu ymm7, YMMWORD PTR [r12+480] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r12+448], ymm0 + vmovdqu YMMWORD PTR [r12+480], ymm1 + add r9, 512 + add rcx, 512 + sub r11, 1 + jg L_pointwise_acc_mont_start_keygen 
+L_pointwise_acc_mont_end_keygen: + ; Base mul mont add + mov r14, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul] + vmovdqu ymm2, YMMWORD PTR [r9] + vmovdqu ymm3, YMMWORD PTR [r9+32] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx] + vmovdqu ymm5, YMMWORD PTR [rcx+32] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14] + vmovdqu ymm11, YMMWORD PTR [r14+32] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12] + vmovdqu ymm7, YMMWORD PTR [r12+32] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r12], ymm0 + vmovdqu YMMWORD PTR [r12+32], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+64] + vmovdqu ymm5, YMMWORD PTR [rcx+96] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR 
[r14+64] + vmovdqu ymm11, YMMWORD PTR [r14+96] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+64] + vmovdqu ymm7, YMMWORD PTR [r12+96] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r12+64], ymm0 + vmovdqu YMMWORD PTR [r12+96], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+128] + vmovdqu ymm3, YMMWORD PTR [r9+160] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+128] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+128] + vmovdqu ymm11, YMMWORD PTR [r14+160] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, 
ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+128] + vmovdqu ymm7, YMMWORD PTR [r12+160] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r12+128], ymm0 + vmovdqu YMMWORD PTR [r12+160], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+192] + vmovdqu ymm3, YMMWORD PTR [r9+224] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+192] + vmovdqu ymm5, YMMWORD PTR [rcx+224] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+192] + vmovdqu ymm11, YMMWORD PTR [r14+224] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+192] + vmovdqu ymm7, YMMWORD PTR [r12+224] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, 
ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r12+192], ymm0 + vmovdqu YMMWORD PTR [r12+224], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+256] + vmovdqu ymm3, YMMWORD PTR [r9+288] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+256] + vmovdqu ymm5, YMMWORD PTR [rcx+288] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+256] + vmovdqu ymm11, YMMWORD PTR [r14+288] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+256] + vmovdqu ymm7, YMMWORD PTR [r12+288] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r12+256], ymm0 + vmovdqu YMMWORD PTR [r12+288], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+320] + vmovdqu ymm3, YMMWORD PTR [r9+352] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+320] + vmovdqu ymm5, YMMWORD PTR [rcx+352] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 
85 + vmovdqu ymm10, YMMWORD PTR [r14+320] + vmovdqu ymm11, YMMWORD PTR [r14+352] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+320] + vmovdqu ymm7, YMMWORD PTR [r12+352] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r12+320], ymm0 + vmovdqu YMMWORD PTR [r12+352], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+384] + vmovdqu ymm3, YMMWORD PTR [r9+416] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+384] + vmovdqu ymm5, YMMWORD PTR [rcx+416] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+384] + vmovdqu ymm11, YMMWORD PTR [r14+416] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw 
ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+384] + vmovdqu ymm7, YMMWORD PTR [r12+416] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r12+384], ymm0 + vmovdqu YMMWORD PTR [r12+416], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+448] + vmovdqu ymm3, YMMWORD PTR [r9+480] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rcx+448] + vmovdqu ymm5, YMMWORD PTR [rcx+480] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r14+448] + vmovdqu ymm11, YMMWORD PTR [r14+480] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm13 + vpmullw ymm9, ymm2, ymm13 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r12+448] + vmovdqu ymm7, YMMWORD PTR [r12+480] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + 
vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r12+448], ymm0 + vmovdqu YMMWORD PTR [r12+480], ymm1 + add r9, 512 + mov rcx, r13 + add r12, 512 + sub r10, 1 + jg L_mlkem_keygen_avx2_acc + movsxd r10, eax + vmovdqu ymm12, YMMWORD PTR mlkem_f + vmovdqu ymm13, YMMWORD PTR mlkem_f_qinv + movsxd r10, eax + mov r12, rdx +L_mlkem_keygen_avx2_to_mont: + ; To Mont + vmovdqu ymm0, YMMWORD PTR [r12] + vmovdqu ymm1, YMMWORD PTR [r12+32] + vmovdqu ymm2, YMMWORD PTR [r12+64] + vmovdqu ymm3, YMMWORD PTR [r12+96] + vpmullw ymm4, ymm0, ymm13 + vpmulhw ymm5, ymm0, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm0, ymm5, ymm4 + vpmullw ymm4, ymm1, ymm13 + vpmulhw ymm5, ymm1, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm1, ymm5, ymm4 + vpmullw ymm4, ymm2, ymm13 + vpmulhw ymm5, ymm2, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm2, ymm5, ymm4 + vpmullw ymm4, ymm3, ymm13 + vpmulhw ymm5, ymm3, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm3, ymm5, ymm4 + vmovdqu YMMWORD PTR [r12], ymm0 + vmovdqu YMMWORD PTR [r12+32], ymm1 + vmovdqu YMMWORD PTR [r12+64], ymm2 + vmovdqu YMMWORD PTR [r12+96], ymm3 + vmovdqu ymm0, YMMWORD PTR [r12+128] + vmovdqu ymm1, YMMWORD PTR [r12+160] + vmovdqu ymm2, YMMWORD PTR [r12+192] + vmovdqu ymm3, YMMWORD PTR [r12+224] + vpmullw ymm4, ymm0, ymm13 + vpmulhw ymm5, ymm0, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm0, ymm5, ymm4 + vpmullw ymm4, ymm1, ymm13 + vpmulhw ymm5, ymm1, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm1, ymm5, ymm4 + vpmullw ymm4, ymm2, ymm13 + vpmulhw ymm5, ymm2, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm2, ymm5, ymm4 + vpmullw ymm4, ymm3, ymm13 + vpmulhw ymm5, ymm3, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm3, ymm5, ymm4 + vmovdqu YMMWORD PTR [r12+128], ymm0 + vmovdqu YMMWORD PTR [r12+160], ymm1 + vmovdqu YMMWORD PTR [r12+192], ymm2 + vmovdqu YMMWORD PTR [r12+224], ymm3 + vmovdqu ymm0, YMMWORD PTR [r12+256] + vmovdqu ymm1, YMMWORD PTR [r12+288] + vmovdqu ymm2, YMMWORD PTR 
[r12+320] + vmovdqu ymm3, YMMWORD PTR [r12+352] + vpmullw ymm4, ymm0, ymm13 + vpmulhw ymm5, ymm0, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm0, ymm5, ymm4 + vpmullw ymm4, ymm1, ymm13 + vpmulhw ymm5, ymm1, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm1, ymm5, ymm4 + vpmullw ymm4, ymm2, ymm13 + vpmulhw ymm5, ymm2, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm2, ymm5, ymm4 + vpmullw ymm4, ymm3, ymm13 + vpmulhw ymm5, ymm3, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm3, ymm5, ymm4 + vmovdqu YMMWORD PTR [r12+256], ymm0 + vmovdqu YMMWORD PTR [r12+288], ymm1 + vmovdqu YMMWORD PTR [r12+320], ymm2 + vmovdqu YMMWORD PTR [r12+352], ymm3 + vmovdqu ymm0, YMMWORD PTR [r12+384] + vmovdqu ymm1, YMMWORD PTR [r12+416] + vmovdqu ymm2, YMMWORD PTR [r12+448] + vmovdqu ymm3, YMMWORD PTR [r12+480] + vpmullw ymm4, ymm0, ymm13 + vpmulhw ymm5, ymm0, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm0, ymm5, ymm4 + vpmullw ymm4, ymm1, ymm13 + vpmulhw ymm5, ymm1, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm1, ymm5, ymm4 + vpmullw ymm4, ymm2, ymm13 + vpmulhw ymm5, ymm2, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm2, ymm5, ymm4 + vpmullw ymm4, ymm3, ymm13 + vpmulhw ymm5, ymm3, ymm12 + vpmulhw ymm4, ymm4, ymm14 + vpsubw ymm3, ymm5, ymm4 + vmovdqu YMMWORD PTR [r12+384], ymm0 + vmovdqu YMMWORD PTR [r12+416], ymm1 + vmovdqu YMMWORD PTR [r12+448], ymm2 + vmovdqu YMMWORD PTR [r12+480], ymm3 + add r12, 512 + sub r10, 1 + jg L_mlkem_keygen_avx2_to_mont + movsxd r10, eax +L_mlkem_keygen_avx2_to_mont_ntt_err: + ; ntt + mov r14, QWORD PTR [ptr_L_mlkem_avx2_zetas] + vmovdqu ymm10, YMMWORD PTR [r14] + vmovdqu ymm12, YMMWORD PTR [r14+32] + vmovdqu ymm0, YMMWORD PTR [r8+128] + vmovdqu ymm1, YMMWORD PTR [r8+160] + vmovdqu ymm2, YMMWORD PTR [r8+192] + vmovdqu ymm3, YMMWORD PTR [r8+224] + vmovdqu ymm4, YMMWORD PTR [r8+384] + vmovdqu ymm5, YMMWORD PTR [r8+416] + vmovdqu ymm6, YMMWORD PTR [r8+448] + vmovdqu ymm7, YMMWORD PTR [r8+480] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, 
ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [r8+160], ymm1 + vmovdqu YMMWORD PTR [r8+192], ymm2 + vmovdqu YMMWORD PTR [r8+224], ymm3 + vmovdqu YMMWORD PTR [r8+384], ymm4 + vmovdqu YMMWORD PTR [r8+416], ymm5 + vmovdqu YMMWORD PTR [r8+448], ymm6 + vmovdqu YMMWORD PTR [r8+480], ymm7 + vmovdqu ymm0, YMMWORD PTR [r8] + vmovdqu ymm1, YMMWORD PTR [r8+32] + vmovdqu ymm2, YMMWORD PTR [r8+64] + vmovdqu ymm3, YMMWORD PTR [r8+96] + vmovdqu ymm4, YMMWORD PTR [r8+256] + vmovdqu ymm5, YMMWORD PTR [r8+288] + vmovdqu ymm6, YMMWORD PTR [r8+320] + vmovdqu ymm7, YMMWORD PTR [r8+352] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + vmovdqu YMMWORD PTR [r8+256], ymm4 + vmovdqu YMMWORD PTR [r8+288], ymm5 + vmovdqu YMMWORD PTR [r8+320], ymm6 + vmovdqu YMMWORD PTR [r8+352], ymm7 + vmovdqu ymm4, YMMWORD PTR [r8+128] + 
vmovdqu ymm5, YMMWORD PTR [r8+160] + vmovdqu ymm6, YMMWORD PTR [r8+192] + vmovdqu ymm7, YMMWORD PTR [r8+224] + ; 64: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+64] + vmovdqu ymm12, YMMWORD PTR [r14+96] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + ; 32: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+128] + vmovdqu ymm12, YMMWORD PTR [r14+160] + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm2, ymm0, ymm8 + vpsubw ymm3, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + ; 32: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+192] + vmovdqu ymm12, YMMWORD PTR [r14+224] + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm4, ymm8 + vpsubw ymm7, ymm5, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm5, ymm5, ymm9 + ; 16: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+256] + vmovdqu ymm12, YMMWORD PTR [r14+288] + vmovdqu ymm11, YMMWORD PTR [r14+320] + vmovdqu ymm13, YMMWORD PTR [r14+352] + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, 
ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 16: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+384] + vmovdqu ymm12, YMMWORD PTR [r14+416] + vmovdqu ymm11, YMMWORD PTR [r14+448] + vmovdqu ymm13, YMMWORD PTR [r14+480] + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 8: 0/3 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [r14+512] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [r14+544] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [r14+576] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [r14+608] + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm0, ymm1, ymm0 + vpsubw ymm2, ymm3, ymm2 + vpsubw ymm1, ymm8, ymm0 + vpsubw ymm3, ymm9, ymm2 + vpaddw ymm8, ymm8, ymm0 + vpaddw ymm9, ymm9, ymm2 + ; 4: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+640] + vmovdqu ymm12, YMMWORD PTR [r14+672] + vmovdqu ymm11, YMMWORD PTR [r14+704] + vmovdqu ymm13, YMMWORD PTR [r14+736] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 8: 0/3 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [r14+768] + vperm2i128 
ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [r14+800] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [r14+832] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [r14+864] + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm4, ymm5, ymm4 + vpsubw ymm6, ymm7, ymm6 + vpsubw ymm5, ymm8, ymm4 + vpsubw ymm7, ymm9, ymm6 + vpaddw ymm8, ymm8, ymm4 + vpaddw ymm9, ymm9, ymm6 + ; 4: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+896] + vmovdqu ymm12, YMMWORD PTR [r14+928] + vmovdqu ymm11, YMMWORD PTR [r14+960] + vmovdqu ymm13, YMMWORD PTR [r14+992] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 2: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+1024] + vmovdqu ymm12, YMMWORD PTR [r14+1056] + vmovdqu ymm11, YMMWORD PTR [r14+1088] + vmovdqu ymm13, YMMWORD PTR [r14+1120] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 2: 0/3 + vmovdqu ymm10, YMMWORD PTR [r14+1152] + vmovdqu ymm12, YMMWORD PTR [r14+1184] + vmovdqu ymm11, YMMWORD PTR [r14+1216] + 
vmovdqu ymm13, YMMWORD PTR [r14+1248] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vperm2i128 ymm2, ymm8, ymm9, 32 + vperm2i128 ymm3, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vperm2i128 ymm4, ymm8, ymm9, 32 + vperm2i128 ymm5, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vperm2i128 ymm6, ymm8, ymm9, 32 + vperm2i128 ymm7, ymm8, ymm9, 49 + vmovdqu ymm8, YMMWORD PTR [rdx] + vmovdqu ymm9, YMMWORD PTR [rdx+32] + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmulhw ymm8, ymm0, ymm15 + vpmulhw ymm9, ymm1, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm0, ymm8 + vpsubw ymm9, ymm1, ymm9 + vmovdqu YMMWORD PTR [rdx], ymm8 + vmovdqu YMMWORD PTR [rdx+32], ymm9 + vmovdqu ymm8, YMMWORD PTR [rdx+64] + vmovdqu ymm9, YMMWORD PTR [rdx+96] + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + vpmulhw ymm8, ymm2, ymm15 + vpmulhw ymm9, ymm3, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vmovdqu YMMWORD PTR [rdx+64], ymm8 + vmovdqu YMMWORD PTR [rdx+96], ymm9 + vmovdqu ymm8, YMMWORD PTR [rdx+128] + vmovdqu ymm9, YMMWORD PTR [rdx+160] + 
vpaddw ymm4, ymm4, ymm8 + vpaddw ymm5, ymm5, ymm9 + vpmulhw ymm8, ymm4, ymm15 + vpmulhw ymm9, ymm5, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vmovdqu YMMWORD PTR [rdx+128], ymm8 + vmovdqu YMMWORD PTR [rdx+160], ymm9 + vmovdqu ymm8, YMMWORD PTR [rdx+192] + vmovdqu ymm9, YMMWORD PTR [rdx+224] + vpaddw ymm6, ymm6, ymm8 + vpaddw ymm7, ymm7, ymm9 + vpmulhw ymm8, ymm6, ymm15 + vpmulhw ymm9, ymm7, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vmovdqu YMMWORD PTR [rdx+192], ymm8 + vmovdqu YMMWORD PTR [rdx+224], ymm9 + vmovdqu ymm0, YMMWORD PTR [r8+256] + vmovdqu ymm1, YMMWORD PTR [r8+288] + vmovdqu ymm2, YMMWORD PTR [r8+320] + vmovdqu ymm3, YMMWORD PTR [r8+352] + vmovdqu ymm4, YMMWORD PTR [r8+384] + vmovdqu ymm5, YMMWORD PTR [r8+416] + vmovdqu ymm6, YMMWORD PTR [r8+448] + vmovdqu ymm7, YMMWORD PTR [r8+480] + ; 64: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+1280] + vmovdqu ymm12, YMMWORD PTR [r14+1312] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + ; 32: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+1344] + vmovdqu ymm12, YMMWORD PTR [r14+1376] + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + 
vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm2, ymm0, ymm8 + vpsubw ymm3, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + ; 32: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+1408] + vmovdqu ymm12, YMMWORD PTR [r14+1440] + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm4, ymm8 + vpsubw ymm7, ymm5, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm5, ymm5, ymm9 + ; 16: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+1472] + vmovdqu ymm12, YMMWORD PTR [r14+1504] + vmovdqu ymm11, YMMWORD PTR [r14+1536] + vmovdqu ymm13, YMMWORD PTR [r14+1568] + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 16: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+1600] + vmovdqu ymm12, YMMWORD PTR [r14+1632] + vmovdqu ymm11, YMMWORD PTR [r14+1664] + vmovdqu ymm13, YMMWORD PTR [r14+1696] + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 8: 1/3 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [r14+1728] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [r14+1760] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [r14+1792] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [r14+1824] + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, 
ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm0, ymm1, ymm0 + vpsubw ymm2, ymm3, ymm2 + vpsubw ymm1, ymm8, ymm0 + vpsubw ymm3, ymm9, ymm2 + vpaddw ymm8, ymm8, ymm0 + vpaddw ymm9, ymm9, ymm2 + ; 4: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+1856] + vmovdqu ymm12, YMMWORD PTR [r14+1888] + vmovdqu ymm11, YMMWORD PTR [r14+1920] + vmovdqu ymm13, YMMWORD PTR [r14+1952] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 8: 1/3 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [r14+1984] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [r14+2016] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [r14+2048] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [r14+2080] + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm4, ymm5, ymm4 + vpsubw ymm6, ymm7, ymm6 + vpsubw ymm5, ymm8, ymm4 + vpsubw ymm7, ymm9, ymm6 + vpaddw ymm8, ymm8, ymm4 + vpaddw ymm9, ymm9, ymm6 + ; 4: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+2112] + vmovdqu ymm12, YMMWORD PTR [r14+2144] + vmovdqu ymm11, YMMWORD PTR [r14+2176] + vmovdqu ymm13, YMMWORD PTR [r14+2208] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw 
ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 2: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+2240] + vmovdqu ymm12, YMMWORD PTR [r14+2272] + vmovdqu ymm11, YMMWORD PTR [r14+2304] + vmovdqu ymm13, YMMWORD PTR [r14+2336] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 2: 1/3 + vmovdqu ymm10, YMMWORD PTR [r14+2368] + vmovdqu ymm12, YMMWORD PTR [r14+2400] + vmovdqu ymm11, YMMWORD PTR [r14+2432] + vmovdqu ymm13, YMMWORD PTR [r14+2464] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vperm2i128 ymm2, ymm8, ymm9, 32 + vperm2i128 ymm3, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vperm2i128 ymm4, ymm8, ymm9, 32 + vperm2i128 ymm5, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, 
ymm6, ymm7 + vperm2i128 ymm6, ymm8, ymm9, 32 + vperm2i128 ymm7, ymm8, ymm9, 49 + vmovdqu ymm8, YMMWORD PTR [rdx+256] + vmovdqu ymm9, YMMWORD PTR [rdx+288] + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmulhw ymm8, ymm0, ymm15 + vpmulhw ymm9, ymm1, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm0, ymm8 + vpsubw ymm9, ymm1, ymm9 + vmovdqu YMMWORD PTR [rdx+256], ymm8 + vmovdqu YMMWORD PTR [rdx+288], ymm9 + vmovdqu ymm8, YMMWORD PTR [rdx+320] + vmovdqu ymm9, YMMWORD PTR [rdx+352] + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + vpmulhw ymm8, ymm2, ymm15 + vpmulhw ymm9, ymm3, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vmovdqu YMMWORD PTR [rdx+320], ymm8 + vmovdqu YMMWORD PTR [rdx+352], ymm9 + vmovdqu ymm8, YMMWORD PTR [rdx+384] + vmovdqu ymm9, YMMWORD PTR [rdx+416] + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm5, ymm5, ymm9 + vpmulhw ymm8, ymm4, ymm15 + vpmulhw ymm9, ymm5, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vmovdqu YMMWORD PTR [rdx+384], ymm8 + vmovdqu YMMWORD PTR [rdx+416], ymm9 + vmovdqu ymm8, YMMWORD PTR [rdx+448] + vmovdqu ymm9, YMMWORD PTR [rdx+480] + vpaddw ymm6, ymm6, ymm8 + vpaddw ymm7, ymm7, ymm9 + vpmulhw ymm8, ymm6, ymm15 + vpmulhw ymm9, ymm7, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vmovdqu YMMWORD PTR [rdx+448], ymm8 + vmovdqu YMMWORD PTR [rdx+480], ymm9 + add r8, 512 + add rdx, 512 + sub r10, 1 + jg L_mlkem_keygen_avx2_to_mont_ntt_err + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD 
PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + pop r14 + pop r13 + pop r12 + ret +mlkem_keygen_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +mlkem_encapsulate_avx2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + push rsi + push rbx + mov rax, QWORD PTR [rsp+96] + mov r10, QWORD PTR [rsp+104] + mov r11, QWORD PTR [rsp+112] + mov r12, QWORD PTR [rsp+120] + mov r13, QWORD PTR [rsp+128] + sub rsp, 208 + vmovdqu OWORD PTR [rsp+48], xmm6 + vmovdqu OWORD PTR [rsp+64], xmm7 + vmovdqu OWORD PTR [rsp+80], xmm8 + vmovdqu OWORD PTR [rsp+96], xmm9 + vmovdqu OWORD PTR [rsp+112], xmm10 + vmovdqu OWORD PTR [rsp+128], xmm11 + vmovdqu OWORD PTR [rsp+144], xmm12 + vmovdqu OWORD PTR [rsp+160], xmm13 + vmovdqu OWORD PTR [rsp+176], xmm14 + vmovdqu OWORD PTR [rsp+192], xmm15 + vmovdqu ymm14, YMMWORD PTR mlkem_q + vmovdqu ymm15, YMMWORD PTR mlkem_v + mov rsi, rax + movsxd r15, r13d + mov rdi, rax +L_mlkem_encapsulate_avx2_trans: + ; ntt + mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas] + vmovdqu ymm10, YMMWORD PTR [rbx] + vmovdqu ymm12, YMMWORD PTR [rbx+32] + vmovdqu ymm0, YMMWORD PTR [rdi+128] + vmovdqu ymm1, YMMWORD PTR [rdi+160] + vmovdqu ymm2, YMMWORD PTR [rdi+192] + vmovdqu ymm3, YMMWORD PTR [rdi+224] + vmovdqu ymm4, YMMWORD PTR [rdi+384] + vmovdqu ymm5, YMMWORD PTR [rdi+416] + vmovdqu ymm6, YMMWORD PTR [rdi+448] + vmovdqu ymm7, YMMWORD PTR [rdi+480] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 
+ vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + vmovdqu YMMWORD PTR [rdi+128], ymm0 + vmovdqu YMMWORD PTR [rdi+160], ymm1 + vmovdqu YMMWORD PTR [rdi+192], ymm2 + vmovdqu YMMWORD PTR [rdi+224], ymm3 + vmovdqu YMMWORD PTR [rdi+384], ymm4 + vmovdqu YMMWORD PTR [rdi+416], ymm5 + vmovdqu YMMWORD PTR [rdi+448], ymm6 + vmovdqu YMMWORD PTR [rdi+480], ymm7 + vmovdqu ymm0, YMMWORD PTR [rdi] + vmovdqu ymm1, YMMWORD PTR [rdi+32] + vmovdqu ymm2, YMMWORD PTR [rdi+64] + vmovdqu ymm3, YMMWORD PTR [rdi+96] + vmovdqu ymm4, YMMWORD PTR [rdi+256] + vmovdqu ymm5, YMMWORD PTR [rdi+288] + vmovdqu ymm6, YMMWORD PTR [rdi+320] + vmovdqu ymm7, YMMWORD PTR [rdi+352] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + vmovdqu YMMWORD PTR [rdi+256], ymm4 + vmovdqu YMMWORD PTR [rdi+288], ymm5 + vmovdqu YMMWORD PTR [rdi+320], ymm6 + vmovdqu YMMWORD PTR [rdi+352], ymm7 + vmovdqu ymm4, YMMWORD PTR [rdi+128] + vmovdqu ymm5, YMMWORD PTR [rdi+160] + vmovdqu ymm6, YMMWORD PTR [rdi+192] + vmovdqu ymm7, YMMWORD PTR [rdi+224] + ; 64: 0/3 + vmovdqu ymm10, YMMWORD PTR [rbx+64] + vmovdqu ymm12, YMMWORD PTR [rbx+96] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + 
vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + ; 32: 0/3 + vmovdqu ymm10, YMMWORD PTR [rbx+128] + vmovdqu ymm12, YMMWORD PTR [rbx+160] + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm2, ymm0, ymm8 + vpsubw ymm3, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + ; 32: 0/3 + vmovdqu ymm10, YMMWORD PTR [rbx+192] + vmovdqu ymm12, YMMWORD PTR [rbx+224] + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm4, ymm8 + vpsubw ymm7, ymm5, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm5, ymm5, ymm9 + ; 16: 0/3 + vmovdqu ymm10, YMMWORD PTR [rbx+256] + vmovdqu ymm12, YMMWORD PTR [rbx+288] + vmovdqu ymm11, YMMWORD PTR [rbx+320] + vmovdqu ymm13, YMMWORD PTR [rbx+352] + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 16: 0/3 + vmovdqu ymm10, YMMWORD PTR [rbx+384] + vmovdqu ymm12, YMMWORD PTR [rbx+416] + vmovdqu ymm11, YMMWORD PTR [rbx+448] + vmovdqu ymm13, YMMWORD PTR [rbx+480] + vpmullw ymm8, ymm5, ymm12 + 
vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 8: 0/3 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+512] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+544] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+576] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+608] + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm0, ymm1, ymm0 + vpsubw ymm2, ymm3, ymm2 + vpsubw ymm1, ymm8, ymm0 + vpsubw ymm3, ymm9, ymm2 + vpaddw ymm8, ymm8, ymm0 + vpaddw ymm9, ymm9, ymm2 + ; 4: 0/3 + vmovdqu ymm10, YMMWORD PTR [rbx+640] + vmovdqu ymm12, YMMWORD PTR [rbx+672] + vmovdqu ymm11, YMMWORD PTR [rbx+704] + vmovdqu ymm13, YMMWORD PTR [rbx+736] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 8: 0/3 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+768] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+800] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+832] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+864] + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, 
ymm6, ymm14 + vpsubw ymm4, ymm5, ymm4 + vpsubw ymm6, ymm7, ymm6 + vpsubw ymm5, ymm8, ymm4 + vpsubw ymm7, ymm9, ymm6 + vpaddw ymm8, ymm8, ymm4 + vpaddw ymm9, ymm9, ymm6 + ; 4: 0/3 + vmovdqu ymm10, YMMWORD PTR [rbx+896] + vmovdqu ymm12, YMMWORD PTR [rbx+928] + vmovdqu ymm11, YMMWORD PTR [rbx+960] + vmovdqu ymm13, YMMWORD PTR [rbx+992] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 2: 0/3 + vmovdqu ymm10, YMMWORD PTR [rbx+1024] + vmovdqu ymm12, YMMWORD PTR [rbx+1056] + vmovdqu ymm11, YMMWORD PTR [rbx+1088] + vmovdqu ymm13, YMMWORD PTR [rbx+1120] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 2: 0/3 + vmovdqu ymm10, YMMWORD PTR [rbx+1152] + vmovdqu ymm12, YMMWORD PTR [rbx+1184] + vmovdqu ymm11, YMMWORD PTR [rbx+1216] + vmovdqu ymm13, YMMWORD PTR [rbx+1248] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw 
ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vperm2i128 ymm2, ymm8, ymm9, 32 + vperm2i128 ymm3, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vperm2i128 ymm4, ymm8, ymm9, 32 + vperm2i128 ymm5, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vperm2i128 ymm6, ymm8, ymm9, 32 + vperm2i128 ymm7, ymm8, ymm9, 49 + vmovdqu YMMWORD PTR [rdi], ymm0 + vmovdqu YMMWORD PTR [rdi+32], ymm1 + vmovdqu YMMWORD PTR [rdi+64], ymm2 + vmovdqu YMMWORD PTR [rdi+96], ymm3 + vmovdqu YMMWORD PTR [rdi+128], ymm4 + vmovdqu YMMWORD PTR [rdi+160], ymm5 + vmovdqu YMMWORD PTR [rdi+192], ymm6 + vmovdqu YMMWORD PTR [rdi+224], ymm7 + vmovdqu ymm0, YMMWORD PTR [rdi+256] + vmovdqu ymm1, YMMWORD PTR [rdi+288] + vmovdqu ymm2, YMMWORD PTR [rdi+320] + vmovdqu ymm3, YMMWORD PTR [rdi+352] + vmovdqu ymm4, YMMWORD PTR [rdi+384] + vmovdqu ymm5, YMMWORD PTR [rdi+416] + vmovdqu ymm6, YMMWORD PTR [rdi+448] + vmovdqu ymm7, YMMWORD PTR [rdi+480] + ; 64: 1/3 + vmovdqu ymm10, YMMWORD PTR [rbx+1280] + vmovdqu ymm12, YMMWORD PTR [rbx+1312] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, 
ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + ; 32: 1/3 + vmovdqu ymm10, YMMWORD PTR [rbx+1344] + vmovdqu ymm12, YMMWORD PTR [rbx+1376] + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm2, ymm0, ymm8 + vpsubw ymm3, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + ; 32: 1/3 + vmovdqu ymm10, YMMWORD PTR [rbx+1408] + vmovdqu ymm12, YMMWORD PTR [rbx+1440] + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm4, ymm8 + vpsubw ymm7, ymm5, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm5, ymm5, ymm9 + ; 16: 1/3 + vmovdqu ymm10, YMMWORD PTR [rbx+1472] + vmovdqu ymm12, YMMWORD PTR [rbx+1504] + vmovdqu ymm11, YMMWORD PTR [rbx+1536] + vmovdqu ymm13, YMMWORD PTR [rbx+1568] + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 16: 1/3 + vmovdqu ymm10, YMMWORD PTR [rbx+1600] + vmovdqu ymm12, YMMWORD PTR [rbx+1632] + vmovdqu ymm11, YMMWORD PTR [rbx+1664] + vmovdqu ymm13, YMMWORD PTR [rbx+1696] + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 8: 1/3 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+1728] + 
vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+1760] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+1792] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+1824] + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm0, ymm1, ymm0 + vpsubw ymm2, ymm3, ymm2 + vpsubw ymm1, ymm8, ymm0 + vpsubw ymm3, ymm9, ymm2 + vpaddw ymm8, ymm8, ymm0 + vpaddw ymm9, ymm9, ymm2 + ; 4: 1/3 + vmovdqu ymm10, YMMWORD PTR [rbx+1856] + vmovdqu ymm12, YMMWORD PTR [rbx+1888] + vmovdqu ymm11, YMMWORD PTR [rbx+1920] + vmovdqu ymm13, YMMWORD PTR [rbx+1952] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 8: 1/3 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+1984] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+2016] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+2048] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+2080] + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm4, ymm5, ymm4 + vpsubw ymm6, ymm7, ymm6 + vpsubw ymm5, ymm8, ymm4 + vpsubw ymm7, ymm9, ymm6 + vpaddw ymm8, ymm8, ymm4 + vpaddw ymm9, ymm9, ymm6 + ; 4: 1/3 + vmovdqu ymm10, YMMWORD PTR [rbx+2112] + vmovdqu ymm12, YMMWORD PTR [rbx+2144] + vmovdqu ymm11, YMMWORD PTR [rbx+2176] + vmovdqu ymm13, YMMWORD PTR [rbx+2208] + vpunpcklqdq ymm4, ymm8, ymm5 + 
vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 2: 1/3 + vmovdqu ymm10, YMMWORD PTR [rbx+2240] + vmovdqu ymm12, YMMWORD PTR [rbx+2272] + vmovdqu ymm11, YMMWORD PTR [rbx+2304] + vmovdqu ymm13, YMMWORD PTR [rbx+2336] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 2: 1/3 + vmovdqu ymm10, YMMWORD PTR [rbx+2368] + vmovdqu ymm12, YMMWORD PTR [rbx+2400] + vmovdqu ymm11, YMMWORD PTR [rbx+2432] + vmovdqu ymm13, YMMWORD PTR [rbx+2464] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm8, ymm9, 49 + vpunpckldq 
ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vperm2i128 ymm2, ymm8, ymm9, 32 + vperm2i128 ymm3, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vperm2i128 ymm4, ymm8, ymm9, 32 + vperm2i128 ymm5, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vperm2i128 ymm6, ymm8, ymm9, 32 + vperm2i128 ymm7, ymm8, ymm9, 49 + vmovdqu YMMWORD PTR [rdi+256], ymm0 + vmovdqu YMMWORD PTR [rdi+288], ymm1 + vmovdqu YMMWORD PTR [rdi+320], ymm2 + vmovdqu YMMWORD PTR [rdi+352], ymm3 + vmovdqu YMMWORD PTR [rdi+384], ymm4 + vmovdqu YMMWORD PTR [rdi+416], ymm5 + vmovdqu YMMWORD PTR [rdi+448], ymm6 + vmovdqu YMMWORD PTR [rdi+480], ymm7 + add rdi, 512 + sub r15, 1 + jg L_mlkem_encapsulate_avx2_trans + movsxd r14, r13d +L_mlkem_encapsulate_avx2_calc: + vmovdqu ymm12, YMMWORD PTR mlkem_qinv + ; Pointwise acc mont + movsxd r15, r13d + ; Base mul mont + mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul] + vmovdqu ymm2, YMMWORD PTR [r9] + vmovdqu ymm3, YMMWORD PTR [r9+32] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax] + vmovdqu ymm5, YMMWORD PTR [rax+32] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx] + vmovdqu ymm11, YMMWORD PTR [rbx+32] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw 
ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+64] + vmovdqu ymm5, YMMWORD PTR [rax+96] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+64] + vmovdqu ymm11, YMMWORD PTR [rbx+96] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+64], ymm0 + vmovdqu YMMWORD PTR [rdx+96], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+128] + vmovdqu ymm3, YMMWORD PTR [r9+160] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+128] + vmovdqu ymm5, YMMWORD PTR [rax+160] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+128] + vmovdqu ymm11, YMMWORD PTR [rbx+160] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw 
ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+128], ymm0 + vmovdqu YMMWORD PTR [rdx+160], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+192] + vmovdqu ymm3, YMMWORD PTR [r9+224] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+192] + vmovdqu ymm5, YMMWORD PTR [rax+224] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+192] + vmovdqu ymm11, YMMWORD PTR [rbx+224] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+192], ymm0 + vmovdqu YMMWORD PTR [rdx+224], ymm1 + 
vmovdqu ymm2, YMMWORD PTR [r9+256] + vmovdqu ymm3, YMMWORD PTR [r9+288] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+256] + vmovdqu ymm5, YMMWORD PTR [rax+288] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+256] + vmovdqu ymm11, YMMWORD PTR [rbx+288] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+256], ymm0 + vmovdqu YMMWORD PTR [rdx+288], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+320] + vmovdqu ymm3, YMMWORD PTR [r9+352] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+320] + vmovdqu ymm5, YMMWORD PTR [rax+352] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+320] + vmovdqu ymm11, YMMWORD PTR [rbx+352] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, 
ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+320], ymm0 + vmovdqu YMMWORD PTR [rdx+352], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+384] + vmovdqu ymm3, YMMWORD PTR [r9+416] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+384] + vmovdqu ymm5, YMMWORD PTR [rax+416] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+384] + vmovdqu ymm11, YMMWORD PTR [rbx+416] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+384], ymm0 + vmovdqu YMMWORD PTR [rdx+416], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+448] + vmovdqu ymm3, YMMWORD PTR [r9+480] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + 
vmovdqu ymm4, YMMWORD PTR [rax+448] + vmovdqu ymm5, YMMWORD PTR [rax+480] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+448] + vmovdqu ymm11, YMMWORD PTR [rbx+480] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+448], ymm0 + vmovdqu YMMWORD PTR [rdx+480], ymm1 + add r9, 512 + add rax, 512 + sub r15, 2 + jz L_pointwise_acc_mont_end_encap_bp +L_pointwise_acc_mont_start_encap_bp: + ; Base mul mont add + mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul] + vmovdqu ymm2, YMMWORD PTR [r9] + vmovdqu ymm3, YMMWORD PTR [r9+32] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax] + vmovdqu ymm5, YMMWORD PTR [rax+32] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx] + vmovdqu ymm11, YMMWORD PTR [rbx+32] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, 
ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx] + vmovdqu ymm7, YMMWORD PTR [rdx+32] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+64] + vmovdqu ymm5, YMMWORD PTR [rax+96] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+64] + vmovdqu ymm11, YMMWORD PTR [rbx+96] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+64] + vmovdqu ymm7, YMMWORD PTR [rdx+96] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx+64], ymm0 + 
vmovdqu YMMWORD PTR [rdx+96], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+128] + vmovdqu ymm3, YMMWORD PTR [r9+160] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+128] + vmovdqu ymm5, YMMWORD PTR [rax+160] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+128] + vmovdqu ymm11, YMMWORD PTR [rbx+160] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+128] + vmovdqu ymm7, YMMWORD PTR [rdx+160] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx+128], ymm0 + vmovdqu YMMWORD PTR [rdx+160], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+192] + vmovdqu ymm3, YMMWORD PTR [r9+224] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+192] + vmovdqu ymm5, YMMWORD PTR [rax+224] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+192] + vmovdqu ymm11, YMMWORD PTR [rbx+224] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; 
Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+192] + vmovdqu ymm7, YMMWORD PTR [rdx+224] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx+192], ymm0 + vmovdqu YMMWORD PTR [rdx+224], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+256] + vmovdqu ymm3, YMMWORD PTR [r9+288] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+256] + vmovdqu ymm5, YMMWORD PTR [rax+288] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+256] + vmovdqu ymm11, YMMWORD PTR [rbx+288] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + 
vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+256] + vmovdqu ymm7, YMMWORD PTR [rdx+288] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx+256], ymm0 + vmovdqu YMMWORD PTR [rdx+288], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+320] + vmovdqu ymm3, YMMWORD PTR [r9+352] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+320] + vmovdqu ymm5, YMMWORD PTR [rax+352] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+320] + vmovdqu ymm11, YMMWORD PTR [rbx+352] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+320] + vmovdqu ymm7, YMMWORD PTR [rdx+352] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx+320], ymm0 + vmovdqu YMMWORD PTR [rdx+352], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+384] + vmovdqu ymm3, YMMWORD PTR [r9+416] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+384] + vmovdqu ymm5, YMMWORD PTR [rax+416] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 
+ vmovdqu ymm10, YMMWORD PTR [rbx+384] + vmovdqu ymm11, YMMWORD PTR [rbx+416] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+384] + vmovdqu ymm7, YMMWORD PTR [rdx+416] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx+384], ymm0 + vmovdqu YMMWORD PTR [rdx+416], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+448] + vmovdqu ymm3, YMMWORD PTR [r9+480] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+448] + vmovdqu ymm5, YMMWORD PTR [rax+480] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+448] + vmovdqu ymm11, YMMWORD PTR [rbx+480] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, 
ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+448] + vmovdqu ymm7, YMMWORD PTR [rdx+480] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx+448], ymm0 + vmovdqu YMMWORD PTR [rdx+480], ymm1 + add r9, 512 + add rax, 512 + sub r15, 1 + jg L_pointwise_acc_mont_start_encap_bp +L_pointwise_acc_mont_end_encap_bp: + ; Base mul mont add + mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul] + vmovdqu ymm2, YMMWORD PTR [r9] + vmovdqu ymm3, YMMWORD PTR [r9+32] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax] + vmovdqu ymm5, YMMWORD PTR [rax+32] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx] + vmovdqu ymm11, YMMWORD PTR [rbx+32] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx] + vmovdqu ymm7, YMMWORD PTR [rdx+32] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + 
vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+64] + vmovdqu ymm3, YMMWORD PTR [r9+96] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+64] + vmovdqu ymm5, YMMWORD PTR [rax+96] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+64] + vmovdqu ymm11, YMMWORD PTR [rbx+96] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+64] + vmovdqu ymm7, YMMWORD PTR [rdx+96] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+64], ymm0 + vmovdqu YMMWORD PTR [rdx+96], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+128] + vmovdqu ymm3, YMMWORD PTR [r9+160] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+128] + vmovdqu ymm5, YMMWORD PTR [rax+160] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, 
YMMWORD PTR [rbx+128] + vmovdqu ymm11, YMMWORD PTR [rbx+160] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+128] + vmovdqu ymm7, YMMWORD PTR [rdx+160] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+128], ymm0 + vmovdqu YMMWORD PTR [rdx+160], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+192] + vmovdqu ymm3, YMMWORD PTR [r9+224] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+192] + vmovdqu ymm5, YMMWORD PTR [rax+224] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+192] + vmovdqu ymm11, YMMWORD PTR [rbx+224] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + 
vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+192] + vmovdqu ymm7, YMMWORD PTR [rdx+224] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+192], ymm0 + vmovdqu YMMWORD PTR [rdx+224], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+256] + vmovdqu ymm3, YMMWORD PTR [r9+288] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+256] + vmovdqu ymm5, YMMWORD PTR [rax+288] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+256] + vmovdqu ymm11, YMMWORD PTR [rbx+288] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+256] + vmovdqu ymm7, YMMWORD PTR [rdx+288] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + 
vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+256], ymm0 + vmovdqu YMMWORD PTR [rdx+288], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+320] + vmovdqu ymm3, YMMWORD PTR [r9+352] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+320] + vmovdqu ymm5, YMMWORD PTR [rax+352] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+320] + vmovdqu ymm11, YMMWORD PTR [rbx+352] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+320] + vmovdqu ymm7, YMMWORD PTR [rdx+352] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+320], ymm0 + vmovdqu YMMWORD PTR [rdx+352], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+384] + vmovdqu ymm3, YMMWORD PTR [r9+416] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+384] + vmovdqu ymm5, YMMWORD PTR [rax+416] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + 
vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+384] + vmovdqu ymm11, YMMWORD PTR [rbx+416] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+384] + vmovdqu ymm7, YMMWORD PTR [rdx+416] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+384], ymm0 + vmovdqu YMMWORD PTR [rdx+416], ymm1 + vmovdqu ymm2, YMMWORD PTR [r9+448] + vmovdqu ymm3, YMMWORD PTR [r9+480] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+448] + vmovdqu ymm5, YMMWORD PTR [rax+480] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+448] + vmovdqu ymm11, YMMWORD PTR [rbx+480] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw 
ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+448] + vmovdqu ymm7, YMMWORD PTR [rdx+480] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+448], ymm0 + vmovdqu YMMWORD PTR [rdx+480], ymm1 + add r9, 512 + mov rax, rsi + ; invntt + mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_inv] + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm1, YMMWORD PTR [rdx+32] + vmovdqu ymm2, YMMWORD PTR [rdx+64] + vmovdqu ymm3, YMMWORD PTR [rdx+96] + vmovdqu ymm4, YMMWORD PTR [rdx+128] + vmovdqu ymm5, YMMWORD PTR [rdx+160] + vmovdqu ymm6, YMMWORD PTR [rdx+192] + vmovdqu ymm7, YMMWORD PTR [rdx+224] + ; 2: 1/2 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rbx] + vperm2i128 ymm9, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+32] + vpsllq ymm0, ymm9, 32 + vpsrlq ymm1, ymm8, 32 + vpblendd ymm0, ymm8, ymm0, 170 + vpblendd ymm1, ymm9, ymm1, 85 + vperm2i128 ymm8, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+64] + vperm2i128 ymm9, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+96] + vpsllq ymm2, ymm9, 32 + vpsrlq ymm3, ymm8, 32 + vpblendd ymm2, ymm8, ymm2, 170 + vpblendd ymm3, ymm9, ymm3, 85 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm2, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm2, ymm2, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm2, ymm2, ymm14 + vpsubw ymm8, ymm8, ymm0 + vpsubw ymm9, ymm9, ymm2 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, 
ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 4: 1/2 + vmovdqu ymm10, YMMWORD PTR [rbx+128] + vmovdqu ymm12, YMMWORD PTR [rbx+160] + vmovdqu ymm11, YMMWORD PTR [rbx+192] + vmovdqu ymm13, YMMWORD PTR [rbx+224] + vpunpckldq ymm0, ymm8, ymm1 + vpunpckhdq ymm1, ymm8, ymm1 + vpunpckldq ymm2, ymm9, ymm3 + vpunpckhdq ymm3, ymm9, ymm3 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 8: 1/2 + vmovdqu ymm10, YMMWORD PTR [rbx+256] + vmovdqu ymm12, YMMWORD PTR [rbx+288] + vmovdqu ymm11, YMMWORD PTR [rbx+320] + vmovdqu ymm13, YMMWORD PTR [rbx+352] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm2, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm2, ymm2, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm2, ymm2, ymm14 + vpsubw ymm8, ymm8, ymm0 + vpsubw ymm9, ymm9, ymm2 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 16: 1/2 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+384] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+416] + vperm2i128 ymm2, ymm9, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+448] + vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+480] + vpsubw ymm8, ymm0, ymm1 + vpsubw ymm9, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm1 + vpaddw ymm2, ymm2, ymm3 + vpmullw ymm1, ymm8, ymm12 + 
vpmullw ymm3, ymm9, ymm13 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm11 + vpmulhw ymm1, ymm1, ymm14 + vpmulhw ymm3, ymm3, ymm14 + vpsubw ymm1, ymm8, ymm1 + vpsubw ymm3, ymm9, ymm3 + ; 32: 1/2 + vmovdqu ymm10, YMMWORD PTR [rbx+512] + vmovdqu ymm12, YMMWORD PTR [rbx+544] + vpaddw ymm8, ymm0, ymm2 + vpaddw ymm9, ymm1, ymm3 + vpsubw ymm2, ymm0, ymm2 + vpsubw ymm3, ymm1, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm1, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm9, ymm1 + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm2, ymm2, ymm8 + vpsubw ymm3, ymm3, ymm9 + ; 2: 1/2 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+576] + vperm2i128 ymm9, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+608] + vpsllq ymm4, ymm9, 32 + vpsrlq ymm5, ymm8, 32 + vpblendd ymm4, ymm8, ymm4, 170 + vpblendd ymm5, ymm9, ymm5, 85 + vperm2i128 ymm8, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+640] + vperm2i128 ymm9, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+672] + vpsllq ymm6, ymm9, 32 + vpsrlq ymm7, ymm8, 32 + vpblendd ymm6, ymm8, ymm6, 170 + vpblendd ymm7, ymm9, ymm7, 85 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm6, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm6, ymm6, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm6, ymm6, ymm14 + vpsubw ymm8, ymm8, ymm4 + vpsubw ymm9, ymm9, ymm6 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 4: 1/2 + vmovdqu ymm10, YMMWORD PTR [rbx+704] + vmovdqu ymm12, YMMWORD PTR [rbx+736] + vmovdqu ymm11, YMMWORD PTR 
[rbx+768] + vmovdqu ymm13, YMMWORD PTR [rbx+800] + vpunpckldq ymm4, ymm8, ymm5 + vpunpckhdq ymm5, ymm8, ymm5 + vpunpckldq ymm6, ymm9, ymm7 + vpunpckhdq ymm7, ymm9, ymm7 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 8: 1/2 + vmovdqu ymm10, YMMWORD PTR [rbx+832] + vmovdqu ymm12, YMMWORD PTR [rbx+864] + vmovdqu ymm11, YMMWORD PTR [rbx+896] + vmovdqu ymm13, YMMWORD PTR [rbx+928] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm6, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm6, ymm6, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm6, ymm6, ymm14 + vpsubw ymm8, ymm8, ymm4 + vpsubw ymm9, ymm9, ymm6 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 16: 1/2 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+960] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+992] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+1024] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+1056] + vpsubw ymm8, ymm4, ymm5 + vpsubw ymm9, ymm6, ymm7 + vpaddw ymm4, ymm4, ymm5 + vpaddw ymm6, ymm6, ymm7 + vpmullw ymm5, ymm8, ymm12 + vpmullw ymm7, ymm9, ymm13 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm11 + vpmulhw ymm5, ymm5, ymm14 + vpmulhw ymm7, ymm7, ymm14 + vpsubw ymm5, ymm8, ymm5 + vpsubw ymm7, ymm9, ymm7 + ; 32: 1/2 + vmovdqu ymm10, YMMWORD PTR 
[rbx+1088] + vmovdqu ymm12, YMMWORD PTR [rbx+1120] + vpaddw ymm8, ymm4, ymm6 + vpaddw ymm9, ymm5, ymm7 + vpsubw ymm6, ymm4, ymm6 + vpsubw ymm7, ymm5, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm5, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm5, ymm5, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + ; 64: 1/2 + vmovdqu ymm10, YMMWORD PTR [rbx+1152] + vmovdqu ymm12, YMMWORD PTR [rbx+1184] + vpsubw ymm8, ymm0, ymm4 + vpsubw ymm9, ymm1, ymm5 + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + vpmullw ymm4, ymm8, ymm12 + vpmullw ymm5, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpsubw ymm8, ymm2, ymm6 + vpsubw ymm9, ymm3, ymm7 + vpaddw ymm2, ymm2, ymm6 + vpaddw ymm3, ymm3, ymm7 + vpmullw ymm6, ymm8, ymm12 + vpmullw ymm7, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpmulhw ymm7, ymm7, ymm14 + vpsubw ymm6, ymm8, ymm6 + vpsubw ymm7, ymm9, ymm7 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + vmovdqu YMMWORD PTR [rdx+128], ymm4 + vmovdqu YMMWORD PTR [rdx+160], ymm5 + vmovdqu YMMWORD PTR [rdx+192], ymm6 + vmovdqu YMMWORD PTR [rdx+224], ymm7 + vmovdqu ymm0, YMMWORD PTR [rdx+256] + vmovdqu ymm1, YMMWORD PTR [rdx+288] + vmovdqu ymm2, YMMWORD PTR [rdx+320] + vmovdqu ymm3, YMMWORD PTR [rdx+352] + vmovdqu ymm4, YMMWORD PTR [rdx+384] + vmovdqu ymm5, YMMWORD PTR [rdx+416] + vmovdqu ymm6, YMMWORD PTR [rdx+448] + vmovdqu ymm7, YMMWORD PTR [rdx+480] + ; 2: 2/2 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD 
PTR [rbx+1216] + vperm2i128 ymm9, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+1248] + vpsllq ymm0, ymm9, 32 + vpsrlq ymm1, ymm8, 32 + vpblendd ymm0, ymm8, ymm0, 170 + vpblendd ymm1, ymm9, ymm1, 85 + vperm2i128 ymm8, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+1280] + vperm2i128 ymm9, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+1312] + vpsllq ymm2, ymm9, 32 + vpsrlq ymm3, ymm8, 32 + vpblendd ymm2, ymm8, ymm2, 170 + vpblendd ymm3, ymm9, ymm3, 85 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm2, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm2, ymm2, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm2, ymm2, ymm14 + vpsubw ymm8, ymm8, ymm0 + vpsubw ymm9, ymm9, ymm2 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 4: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+1344] + vmovdqu ymm12, YMMWORD PTR [rbx+1376] + vmovdqu ymm11, YMMWORD PTR [rbx+1408] + vmovdqu ymm13, YMMWORD PTR [rbx+1440] + vpunpckldq ymm0, ymm8, ymm1 + vpunpckhdq ymm1, ymm8, ymm1 + vpunpckldq ymm2, ymm9, ymm3 + vpunpckhdq ymm3, ymm9, ymm3 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 8: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+1472] + vmovdqu ymm12, YMMWORD PTR [rbx+1504] + vmovdqu ymm11, YMMWORD PTR [rbx+1536] + vmovdqu ymm13, YMMWORD PTR [rbx+1568] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw 
ymm3, ymm2, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm2, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm2, ymm2, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm2, ymm2, ymm14 + vpsubw ymm8, ymm8, ymm0 + vpsubw ymm9, ymm9, ymm2 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 16: 2/2 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+1600] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+1632] + vperm2i128 ymm2, ymm9, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+1664] + vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+1696] + vpsubw ymm8, ymm0, ymm1 + vpsubw ymm9, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm1 + vpaddw ymm2, ymm2, ymm3 + vpmullw ymm1, ymm8, ymm12 + vpmullw ymm3, ymm9, ymm13 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm11 + vpmulhw ymm1, ymm1, ymm14 + vpmulhw ymm3, ymm3, ymm14 + vpsubw ymm1, ymm8, ymm1 + vpsubw ymm3, ymm9, ymm3 + ; 32: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+1728] + vmovdqu ymm12, YMMWORD PTR [rbx+1760] + vpaddw ymm8, ymm0, ymm2 + vpaddw ymm9, ymm1, ymm3 + vpsubw ymm2, ymm0, ymm2 + vpsubw ymm3, ymm1, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm1, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm9, ymm1 + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm2, ymm2, ymm8 + vpsubw ymm3, ymm3, ymm9 + ; 2: 2/2 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+1792] + vperm2i128 ymm9, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+1824] + vpsllq ymm4, ymm9, 32 + vpsrlq ymm5, ymm8, 32 + vpblendd ymm4, ymm8, ymm4, 170 + vpblendd ymm5, ymm9, ymm5, 85 + vperm2i128 ymm8, 
ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+1856] + vperm2i128 ymm9, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+1888] + vpsllq ymm6, ymm9, 32 + vpsrlq ymm7, ymm8, 32 + vpblendd ymm6, ymm8, ymm6, 170 + vpblendd ymm7, ymm9, ymm7, 85 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm6, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm6, ymm6, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm6, ymm6, ymm14 + vpsubw ymm8, ymm8, ymm4 + vpsubw ymm9, ymm9, ymm6 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 4: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+1920] + vmovdqu ymm12, YMMWORD PTR [rbx+1952] + vmovdqu ymm11, YMMWORD PTR [rbx+1984] + vmovdqu ymm13, YMMWORD PTR [rbx+2016] + vpunpckldq ymm4, ymm8, ymm5 + vpunpckhdq ymm5, ymm8, ymm5 + vpunpckldq ymm6, ymm9, ymm7 + vpunpckhdq ymm7, ymm9, ymm7 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 8: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+2048] + vmovdqu ymm12, YMMWORD PTR [rbx+2080] + vmovdqu ymm11, YMMWORD PTR [rbx+2112] + vmovdqu ymm13, YMMWORD PTR [rbx+2144] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm6, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm6, ymm6, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm6, ymm6, ymm14 + vpsubw ymm8, ymm8, ymm4 + vpsubw ymm9, ymm9, 
ymm6 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 16: 2/2 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+2176] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+2208] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+2240] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+2272] + vpsubw ymm8, ymm4, ymm5 + vpsubw ymm9, ymm6, ymm7 + vpaddw ymm4, ymm4, ymm5 + vpaddw ymm6, ymm6, ymm7 + vpmullw ymm5, ymm8, ymm12 + vpmullw ymm7, ymm9, ymm13 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm11 + vpmulhw ymm5, ymm5, ymm14 + vpmulhw ymm7, ymm7, ymm14 + vpsubw ymm5, ymm8, ymm5 + vpsubw ymm7, ymm9, ymm7 + ; 32: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+2304] + vmovdqu ymm12, YMMWORD PTR [rbx+2336] + vpaddw ymm8, ymm4, ymm6 + vpaddw ymm9, ymm5, ymm7 + vpsubw ymm6, ymm4, ymm6 + vpsubw ymm7, ymm5, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm5, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm5, ymm5, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + ; 64: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+2368] + vmovdqu ymm12, YMMWORD PTR [rbx+2400] + vpsubw ymm8, ymm0, ymm4 + vpsubw ymm9, ymm1, ymm5 + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + vpmullw ymm4, ymm8, ymm12 + vpmullw ymm5, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpsubw ymm8, ymm2, ymm6 + vpsubw ymm9, ymm3, ymm7 + vpaddw ymm2, ymm2, ymm6 + vpaddw ymm3, ymm3, 
ymm7 + vpmullw ymm6, ymm8, ymm12 + vpmullw ymm7, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpmulhw ymm7, ymm7, ymm14 + vpsubw ymm6, ymm8, ymm6 + vpsubw ymm7, ymm9, ymm7 + vmovdqu YMMWORD PTR [rdx+256], ymm0 + vmovdqu YMMWORD PTR [rdx+288], ymm1 + vmovdqu YMMWORD PTR [rdx+320], ymm2 + vmovdqu YMMWORD PTR [rdx+352], ymm3 + ; 128 + vmovdqu ymm10, YMMWORD PTR [rbx+2432] + vmovdqu ymm12, YMMWORD PTR [rbx+2464] + vmovdqu ymm11, YMMWORD PTR [rbx+2496] + vmovdqu ymm13, YMMWORD PTR [rbx+2528] + vmovdqu ymm0, YMMWORD PTR [rdx+128] + vmovdqu ymm1, YMMWORD PTR [rdx+160] + vmovdqu ymm2, YMMWORD PTR [rdx+192] + vmovdqu ymm3, YMMWORD PTR [rdx+224] + vpsubw ymm8, ymm0, ymm4 + vpsubw ymm9, ymm1, ymm5 + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + vpmullw ymm4, ymm8, ymm12 + vpmullw ymm5, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpaddw ymm8, ymm2, ymm6 + vpaddw ymm9, ymm3, ymm7 + vpsubw ymm6, ymm2, ymm6 + vpsubw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm8, ymm15 + vpmulhw ymm3, ymm9, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm8, ymm2 + vpsubw ymm3, ymm9, ymm3 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm0, ymm0, ymm11 + vpmulhw ymm1, ymm1, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm0, ymm8 + vpsubw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm2, ymm13 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm2, ymm2, ymm11 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm2, ymm2, ymm8 + vpsubw ymm3, ymm3, ymm9 + 
vpmullw ymm8, ymm4, ymm13 + vpmullw ymm9, ymm5, ymm13 + vpmulhw ymm4, ymm4, ymm11 + vpmulhw ymm5, ymm5, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm4, ymm4, ymm8 + vpsubw ymm5, ymm5, ymm9 + vpmullw ymm8, ymm6, ymm13 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm6, ymm6, ymm11 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + vmovdqu YMMWORD PTR [rdx+128], ymm0 + vmovdqu YMMWORD PTR [rdx+160], ymm1 + vmovdqu YMMWORD PTR [rdx+192], ymm2 + vmovdqu YMMWORD PTR [rdx+224], ymm3 + vmovdqu YMMWORD PTR [rdx+384], ymm4 + vmovdqu YMMWORD PTR [rdx+416], ymm5 + vmovdqu YMMWORD PTR [rdx+448], ymm6 + vmovdqu YMMWORD PTR [rdx+480], ymm7 + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm1, YMMWORD PTR [rdx+32] + vmovdqu ymm2, YMMWORD PTR [rdx+64] + vmovdqu ymm3, YMMWORD PTR [rdx+96] + vmovdqu ymm4, YMMWORD PTR [rdx+256] + vmovdqu ymm5, YMMWORD PTR [rdx+288] + vmovdqu ymm6, YMMWORD PTR [rdx+320] + vmovdqu ymm7, YMMWORD PTR [rdx+352] + vpsubw ymm8, ymm0, ymm4 + vpsubw ymm9, ymm1, ymm5 + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + vpmullw ymm4, ymm8, ymm12 + vpmullw ymm5, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpaddw ymm8, ymm2, ymm6 + vpaddw ymm9, ymm3, ymm7 + vpsubw ymm6, ymm2, ymm6 + vpsubw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm8, ymm15 + vpmulhw ymm3, ymm9, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm8, ymm2 + vpsubw ymm3, ymm9, ymm3 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm0, ymm0, ymm11 + vpmulhw ymm1, 
ymm1, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm0, ymm8 + vpsubw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm2, ymm13 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm2, ymm2, ymm11 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm2, ymm2, ymm8 + vpsubw ymm3, ymm3, ymm9 + vpmullw ymm8, ymm4, ymm13 + vpmullw ymm9, ymm5, ymm13 + vpmulhw ymm4, ymm4, ymm11 + vpmulhw ymm5, ymm5, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm4, ymm4, ymm8 + vpsubw ymm5, ymm5, ymm9 + vpmullw ymm8, ymm6, ymm13 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm6, ymm6, ymm11 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + vmovdqu YMMWORD PTR [rdx+256], ymm4 + vmovdqu YMMWORD PTR [rdx+288], ymm5 + vmovdqu YMMWORD PTR [rdx+320], ymm6 + vmovdqu YMMWORD PTR [rdx+352], ymm7 + ; Add Errors + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm1, YMMWORD PTR [rdx+32] + vmovdqu ymm2, YMMWORD PTR [rdx+64] + vmovdqu ymm3, YMMWORD PTR [rdx+96] + vmovdqu ymm4, YMMWORD PTR [r10] + vmovdqu ymm5, YMMWORD PTR [r10+32] + vmovdqu ymm6, YMMWORD PTR [r10+64] + vmovdqu ymm7, YMMWORD PTR [r10+96] + vpaddw ymm4, ymm0, ymm4 + vpaddw ymm5, ymm1, ymm5 + vpmulhw ymm0, ymm4, ymm15 + vpmulhw ymm1, ymm5, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm4, ymm0 + vpsubw ymm1, ymm5, ymm1 + vpaddw ymm6, ymm2, ymm6 + vpaddw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm6, ymm15 + vpmulhw ymm3, ymm7, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm6, ymm2 + vpsubw ymm3, ymm7, ymm3 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu YMMWORD 
PTR [rdx+64], ymm2 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + vmovdqu ymm0, YMMWORD PTR [rdx+128] + vmovdqu ymm1, YMMWORD PTR [rdx+160] + vmovdqu ymm2, YMMWORD PTR [rdx+192] + vmovdqu ymm3, YMMWORD PTR [rdx+224] + vmovdqu ymm4, YMMWORD PTR [r10+128] + vmovdqu ymm5, YMMWORD PTR [r10+160] + vmovdqu ymm6, YMMWORD PTR [r10+192] + vmovdqu ymm7, YMMWORD PTR [r10+224] + vpaddw ymm4, ymm0, ymm4 + vpaddw ymm5, ymm1, ymm5 + vpmulhw ymm0, ymm4, ymm15 + vpmulhw ymm1, ymm5, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm4, ymm0 + vpsubw ymm1, ymm5, ymm1 + vpaddw ymm6, ymm2, ymm6 + vpaddw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm6, ymm15 + vpmulhw ymm3, ymm7, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm6, ymm2 + vpsubw ymm3, ymm7, ymm3 + vmovdqu YMMWORD PTR [rdx+128], ymm0 + vmovdqu YMMWORD PTR [rdx+160], ymm1 + vmovdqu YMMWORD PTR [rdx+192], ymm2 + vmovdqu YMMWORD PTR [rdx+224], ymm3 + vmovdqu ymm0, YMMWORD PTR [rdx+256] + vmovdqu ymm1, YMMWORD PTR [rdx+288] + vmovdqu ymm2, YMMWORD PTR [rdx+320] + vmovdqu ymm3, YMMWORD PTR [rdx+352] + vmovdqu ymm4, YMMWORD PTR [r10+256] + vmovdqu ymm5, YMMWORD PTR [r10+288] + vmovdqu ymm6, YMMWORD PTR [r10+320] + vmovdqu ymm7, YMMWORD PTR [r10+352] + vpaddw ymm4, ymm0, ymm4 + vpaddw ymm5, ymm1, ymm5 + vpmulhw ymm0, ymm4, ymm15 + vpmulhw ymm1, ymm5, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm4, ymm0 + vpsubw ymm1, ymm5, ymm1 + vpaddw ymm6, ymm2, ymm6 + vpaddw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm6, ymm15 + vpmulhw ymm3, ymm7, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm6, ymm2 + vpsubw ymm3, ymm7, ymm3 + vmovdqu YMMWORD PTR [rdx+256], ymm0 + vmovdqu YMMWORD PTR [rdx+288], ymm1 + vmovdqu YMMWORD PTR [rdx+320], ymm2 + vmovdqu 
YMMWORD PTR [rdx+352], ymm3 + vmovdqu ymm0, YMMWORD PTR [rdx+384] + vmovdqu ymm1, YMMWORD PTR [rdx+416] + vmovdqu ymm2, YMMWORD PTR [rdx+448] + vmovdqu ymm3, YMMWORD PTR [rdx+480] + vmovdqu ymm4, YMMWORD PTR [r10+384] + vmovdqu ymm5, YMMWORD PTR [r10+416] + vmovdqu ymm6, YMMWORD PTR [r10+448] + vmovdqu ymm7, YMMWORD PTR [r10+480] + vpaddw ymm4, ymm0, ymm4 + vpaddw ymm5, ymm1, ymm5 + vpmulhw ymm0, ymm4, ymm15 + vpmulhw ymm1, ymm5, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm4, ymm0 + vpsubw ymm1, ymm5, ymm1 + vpaddw ymm6, ymm2, ymm6 + vpaddw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm6, ymm15 + vpmulhw ymm3, ymm7, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm6, ymm2 + vpsubw ymm3, ymm7, ymm3 + vmovdqu YMMWORD PTR [rdx+384], ymm0 + vmovdqu YMMWORD PTR [rdx+416], ymm1 + vmovdqu YMMWORD PTR [rdx+448], ymm2 + vmovdqu YMMWORD PTR [rdx+480], ymm3 + add r10, 512 + add rdx, 512 + sub r14, 1 + jg L_mlkem_encapsulate_avx2_calc + vmovdqu ymm12, YMMWORD PTR mlkem_qinv + ; Pointwise acc mont + movsxd r15, r13d + ; Base mul mont + mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul] + vmovdqu ymm2, YMMWORD PTR [rcx] + vmovdqu ymm3, YMMWORD PTR [rcx+32] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax] + vmovdqu ymm5, YMMWORD PTR [rax+32] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx] + vmovdqu ymm11, YMMWORD PTR [rbx+32] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, 
ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+64] + vmovdqu ymm5, YMMWORD PTR [rax+96] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+64] + vmovdqu ymm11, YMMWORD PTR [rbx+96] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+128] + vmovdqu ymm3, YMMWORD PTR [rcx+160] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+128] + vmovdqu ymm5, 
YMMWORD PTR [rax+160] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+128] + vmovdqu ymm11, YMMWORD PTR [rbx+160] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [r8+160], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+192] + vmovdqu ymm3, YMMWORD PTR [rcx+224] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+192] + vmovdqu ymm5, YMMWORD PTR [rax+224] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+192] + vmovdqu ymm11, YMMWORD PTR [rbx+224] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw 
ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r8+192], ymm0 + vmovdqu YMMWORD PTR [r8+224], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+256] + vmovdqu ymm3, YMMWORD PTR [rcx+288] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+256] + vmovdqu ymm5, YMMWORD PTR [rax+288] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+256] + vmovdqu ymm11, YMMWORD PTR [rbx+288] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r8+256], ymm0 + vmovdqu YMMWORD PTR [r8+288], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+320] + vmovdqu ymm5, YMMWORD PTR [rax+352] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+320] + vmovdqu ymm11, 
YMMWORD PTR [rbx+352] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r8+320], ymm0 + vmovdqu YMMWORD PTR [r8+352], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+384] + vmovdqu ymm3, YMMWORD PTR [rcx+416] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+384] + vmovdqu ymm5, YMMWORD PTR [rax+416] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+384] + vmovdqu ymm11, YMMWORD PTR [rbx+416] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, 
ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r8+384], ymm0 + vmovdqu YMMWORD PTR [r8+416], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+448] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+448] + vmovdqu ymm5, YMMWORD PTR [rax+480] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+448] + vmovdqu ymm11, YMMWORD PTR [rbx+480] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [r8+448], ymm0 + vmovdqu YMMWORD PTR [r8+480], ymm1 + add rcx, 512 + add rax, 512 + sub r15, 2 + jz L_pointwise_acc_mont_end_encap_v +L_pointwise_acc_mont_start_encap_v: + ; Base mul mont add + mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul] + vmovdqu ymm2, YMMWORD PTR [rcx] + vmovdqu ymm3, YMMWORD PTR [rcx+32] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax] + vmovdqu ymm5, YMMWORD PTR [rax+32] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx] + vmovdqu ymm11, YMMWORD PTR 
[rbx+32] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8] + vmovdqu ymm7, YMMWORD PTR [r8+32] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+64] + vmovdqu ymm5, YMMWORD PTR [rax+96] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+64] + vmovdqu ymm11, YMMWORD PTR [rbx+96] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + 
vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+64] + vmovdqu ymm7, YMMWORD PTR [r8+96] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+128] + vmovdqu ymm3, YMMWORD PTR [rcx+160] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+128] + vmovdqu ymm5, YMMWORD PTR [rax+160] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+128] + vmovdqu ymm11, YMMWORD PTR [rbx+160] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+128] + vmovdqu ymm7, YMMWORD PTR [r8+160] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [r8+160], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+192] + vmovdqu ymm3, YMMWORD PTR [rcx+224] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+192] + vmovdqu ymm5, YMMWORD PTR [rax+224] + vpslld 
ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+192] + vmovdqu ymm11, YMMWORD PTR [rbx+224] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+192] + vmovdqu ymm7, YMMWORD PTR [r8+224] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r8+192], ymm0 + vmovdqu YMMWORD PTR [r8+224], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+256] + vmovdqu ymm3, YMMWORD PTR [rcx+288] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+256] + vmovdqu ymm5, YMMWORD PTR [rax+288] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+256] + vmovdqu ymm11, YMMWORD PTR [rbx+288] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 
+ vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+256] + vmovdqu ymm7, YMMWORD PTR [r8+288] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r8+256], ymm0 + vmovdqu YMMWORD PTR [r8+288], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+320] + vmovdqu ymm5, YMMWORD PTR [rax+352] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+320] + vmovdqu ymm11, YMMWORD PTR [rbx+352] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+320] + vmovdqu ymm7, YMMWORD PTR [r8+352] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r8+320], ymm0 + vmovdqu YMMWORD PTR [r8+352], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+384] + vmovdqu ymm3, YMMWORD PTR [rcx+416] + vpslld ymm6, 
ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+384] + vmovdqu ymm5, YMMWORD PTR [rax+416] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+384] + vmovdqu ymm11, YMMWORD PTR [rbx+416] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+384] + vmovdqu ymm7, YMMWORD PTR [r8+416] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r8+384], ymm0 + vmovdqu YMMWORD PTR [r8+416], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+448] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+448] + vmovdqu ymm5, YMMWORD PTR [rax+480] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+448] + vmovdqu ymm11, YMMWORD PTR [rbx+480] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + 
vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+448] + vmovdqu ymm7, YMMWORD PTR [r8+480] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [r8+448], ymm0 + vmovdqu YMMWORD PTR [r8+480], ymm1 + add rcx, 512 + add rax, 512 + sub r15, 1 + jg L_pointwise_acc_mont_start_encap_v +L_pointwise_acc_mont_end_encap_v: + ; Base mul mont add + mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul] + vmovdqu ymm2, YMMWORD PTR [rcx] + vmovdqu ymm3, YMMWORD PTR [rcx+32] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax] + vmovdqu ymm5, YMMWORD PTR [rax+32] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx] + vmovdqu ymm11, YMMWORD PTR [rbx+32] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + 
vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8] + vmovdqu ymm7, YMMWORD PTR [r8+32] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+64] + vmovdqu ymm5, YMMWORD PTR [rax+96] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+64] + vmovdqu ymm11, YMMWORD PTR [rbx+96] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+64] + vmovdqu ymm7, YMMWORD PTR [r8+96] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r8+64], ymm0 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+128] + vmovdqu ymm3, YMMWORD PTR [rcx+160] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 
+ vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+128] + vmovdqu ymm5, YMMWORD PTR [rax+160] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+128] + vmovdqu ymm11, YMMWORD PTR [rbx+160] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+128] + vmovdqu ymm7, YMMWORD PTR [r8+160] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [r8+160], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+192] + vmovdqu ymm3, YMMWORD PTR [rcx+224] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+192] + vmovdqu ymm5, YMMWORD PTR [rax+224] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+192] + vmovdqu ymm11, YMMWORD PTR [rbx+224] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + 
vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+192] + vmovdqu ymm7, YMMWORD PTR [r8+224] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r8+192], ymm0 + vmovdqu YMMWORD PTR [r8+224], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+256] + vmovdqu ymm3, YMMWORD PTR [rcx+288] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+256] + vmovdqu ymm5, YMMWORD PTR [rax+288] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+256] + vmovdqu ymm11, YMMWORD PTR [rbx+288] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw 
ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+256] + vmovdqu ymm7, YMMWORD PTR [r8+288] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r8+256], ymm0 + vmovdqu YMMWORD PTR [r8+288], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+320] + vmovdqu ymm5, YMMWORD PTR [rax+352] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+320] + vmovdqu ymm11, YMMWORD PTR [rbx+352] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+320] + vmovdqu ymm7, YMMWORD PTR [r8+352] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r8+320], ymm0 + vmovdqu YMMWORD PTR [r8+352], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+384] + vmovdqu ymm3, YMMWORD PTR [rcx+416] + 
vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+384] + vmovdqu ymm5, YMMWORD PTR [rax+416] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+384] + vmovdqu ymm11, YMMWORD PTR [rbx+416] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+384] + vmovdqu ymm7, YMMWORD PTR [r8+416] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r8+384], ymm0 + vmovdqu YMMWORD PTR [r8+416], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+448] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [rax+448] + vmovdqu ymm5, YMMWORD PTR [rax+480] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [rbx+448] + vmovdqu ymm11, YMMWORD PTR [rbx+480] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; 
Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [r8+448] + vmovdqu ymm7, YMMWORD PTR [r8+480] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [r8+448], ymm0 + vmovdqu YMMWORD PTR [r8+480], ymm1 + add rcx, 512 + ; invntt + mov rbx, QWORD PTR [ptr_L_mlkem_avx2_zetas_inv] + vmovdqu ymm0, YMMWORD PTR [r8] + vmovdqu ymm1, YMMWORD PTR [r8+32] + vmovdqu ymm2, YMMWORD PTR [r8+64] + vmovdqu ymm3, YMMWORD PTR [r8+96] + vmovdqu ymm4, YMMWORD PTR [r8+128] + vmovdqu ymm5, YMMWORD PTR [r8+160] + vmovdqu ymm6, YMMWORD PTR [r8+192] + vmovdqu ymm7, YMMWORD PTR [r8+224] + ; 2: 1/2 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rbx] + vperm2i128 ymm9, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+32] + vpsllq ymm0, ymm9, 32 + vpsrlq ymm1, ymm8, 32 + vpblendd ymm0, ymm8, ymm0, 170 + vpblendd ymm1, ymm9, ymm1, 85 + vperm2i128 ymm8, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+64] + vperm2i128 ymm9, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+96] + vpsllq ymm2, ymm9, 32 + vpsrlq ymm3, ymm8, 32 + vpblendd ymm2, ymm8, ymm2, 170 + vpblendd ymm3, ymm9, ymm3, 85 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm2, ymm9, ymm15 + vpsraw 
ymm0, ymm0, 10 + vpsraw ymm2, ymm2, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm2, ymm2, ymm14 + vpsubw ymm8, ymm8, ymm0 + vpsubw ymm9, ymm9, ymm2 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 4: 1/2 + vmovdqu ymm10, YMMWORD PTR [rbx+128] + vmovdqu ymm12, YMMWORD PTR [rbx+160] + vmovdqu ymm11, YMMWORD PTR [rbx+192] + vmovdqu ymm13, YMMWORD PTR [rbx+224] + vpunpckldq ymm0, ymm8, ymm1 + vpunpckhdq ymm1, ymm8, ymm1 + vpunpckldq ymm2, ymm9, ymm3 + vpunpckhdq ymm3, ymm9, ymm3 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 8: 1/2 + vmovdqu ymm10, YMMWORD PTR [rbx+256] + vmovdqu ymm12, YMMWORD PTR [rbx+288] + vmovdqu ymm11, YMMWORD PTR [rbx+320] + vmovdqu ymm13, YMMWORD PTR [rbx+352] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm2, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm2, ymm2, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm2, ymm2, ymm14 + vpsubw ymm8, ymm8, ymm0 + vpsubw ymm9, ymm9, ymm2 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 16: 1/2 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+384] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+416] + vperm2i128 ymm2, ymm9, ymm3, 32 + 
vmovdqu ymm11, YMMWORD PTR [rbx+448] + vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+480] + vpsubw ymm8, ymm0, ymm1 + vpsubw ymm9, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm1 + vpaddw ymm2, ymm2, ymm3 + vpmullw ymm1, ymm8, ymm12 + vpmullw ymm3, ymm9, ymm13 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm11 + vpmulhw ymm1, ymm1, ymm14 + vpmulhw ymm3, ymm3, ymm14 + vpsubw ymm1, ymm8, ymm1 + vpsubw ymm3, ymm9, ymm3 + ; 32: 1/2 + vmovdqu ymm10, YMMWORD PTR [rbx+512] + vmovdqu ymm12, YMMWORD PTR [rbx+544] + vpaddw ymm8, ymm0, ymm2 + vpaddw ymm9, ymm1, ymm3 + vpsubw ymm2, ymm0, ymm2 + vpsubw ymm3, ymm1, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm1, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm9, ymm1 + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm2, ymm2, ymm8 + vpsubw ymm3, ymm3, ymm9 + ; 2: 1/2 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+576] + vperm2i128 ymm9, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+608] + vpsllq ymm4, ymm9, 32 + vpsrlq ymm5, ymm8, 32 + vpblendd ymm4, ymm8, ymm4, 170 + vpblendd ymm5, ymm9, ymm5, 85 + vperm2i128 ymm8, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+640] + vperm2i128 ymm9, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+672] + vpsllq ymm6, ymm9, 32 + vpsrlq ymm7, ymm8, 32 + vpblendd ymm6, ymm8, ymm6, 170 + vpblendd ymm7, ymm9, ymm7, 85 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm6, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm6, ymm6, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm6, ymm6, ymm14 + vpsubw ymm8, ymm8, ymm4 + vpsubw ymm9, ymm9, ymm6 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw 
ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 4: 1/2 + vmovdqu ymm10, YMMWORD PTR [rbx+704] + vmovdqu ymm12, YMMWORD PTR [rbx+736] + vmovdqu ymm11, YMMWORD PTR [rbx+768] + vmovdqu ymm13, YMMWORD PTR [rbx+800] + vpunpckldq ymm4, ymm8, ymm5 + vpunpckhdq ymm5, ymm8, ymm5 + vpunpckldq ymm6, ymm9, ymm7 + vpunpckhdq ymm7, ymm9, ymm7 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 8: 1/2 + vmovdqu ymm10, YMMWORD PTR [rbx+832] + vmovdqu ymm12, YMMWORD PTR [rbx+864] + vmovdqu ymm11, YMMWORD PTR [rbx+896] + vmovdqu ymm13, YMMWORD PTR [rbx+928] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm6, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm6, ymm6, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm6, ymm6, ymm14 + vpsubw ymm8, ymm8, ymm4 + vpsubw ymm9, ymm9, ymm6 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 16: 1/2 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+960] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+992] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+1024] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+1056] + vpsubw ymm8, ymm4, ymm5 + vpsubw ymm9, ymm6, ymm7 + vpaddw ymm4, ymm4, ymm5 + vpaddw ymm6, ymm6, ymm7 + vpmullw ymm5, ymm8, 
ymm12 + vpmullw ymm7, ymm9, ymm13 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm11 + vpmulhw ymm5, ymm5, ymm14 + vpmulhw ymm7, ymm7, ymm14 + vpsubw ymm5, ymm8, ymm5 + vpsubw ymm7, ymm9, ymm7 + ; 32: 1/2 + vmovdqu ymm10, YMMWORD PTR [rbx+1088] + vmovdqu ymm12, YMMWORD PTR [rbx+1120] + vpaddw ymm8, ymm4, ymm6 + vpaddw ymm9, ymm5, ymm7 + vpsubw ymm6, ymm4, ymm6 + vpsubw ymm7, ymm5, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm5, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm5, ymm5, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + ; 64: 1/2 + vmovdqu ymm10, YMMWORD PTR [rbx+1152] + vmovdqu ymm12, YMMWORD PTR [rbx+1184] + vpsubw ymm8, ymm0, ymm4 + vpsubw ymm9, ymm1, ymm5 + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + vpmullw ymm4, ymm8, ymm12 + vpmullw ymm5, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpsubw ymm8, ymm2, ymm6 + vpsubw ymm9, ymm3, ymm7 + vpaddw ymm2, ymm2, ymm6 + vpaddw ymm3, ymm3, ymm7 + vpmullw ymm6, ymm8, ymm12 + vpmullw ymm7, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpmulhw ymm7, ymm7, ymm14 + vpsubw ymm6, ymm8, ymm6 + vpsubw ymm7, ymm9, ymm7 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm4 + vmovdqu YMMWORD PTR [r8+160], ymm5 + vmovdqu YMMWORD PTR [r8+192], ymm6 + vmovdqu YMMWORD PTR [r8+224], ymm7 + vmovdqu ymm0, YMMWORD PTR [r8+256] + vmovdqu ymm1, YMMWORD PTR [r8+288] + vmovdqu ymm2, YMMWORD PTR [r8+320] + vmovdqu ymm3, YMMWORD PTR 
[r8+352] + vmovdqu ymm4, YMMWORD PTR [r8+384] + vmovdqu ymm5, YMMWORD PTR [r8+416] + vmovdqu ymm6, YMMWORD PTR [r8+448] + vmovdqu ymm7, YMMWORD PTR [r8+480] + ; 2: 2/2 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+1216] + vperm2i128 ymm9, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+1248] + vpsllq ymm0, ymm9, 32 + vpsrlq ymm1, ymm8, 32 + vpblendd ymm0, ymm8, ymm0, 170 + vpblendd ymm1, ymm9, ymm1, 85 + vperm2i128 ymm8, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+1280] + vperm2i128 ymm9, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+1312] + vpsllq ymm2, ymm9, 32 + vpsrlq ymm3, ymm8, 32 + vpblendd ymm2, ymm8, ymm2, 170 + vpblendd ymm3, ymm9, ymm3, 85 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm2, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm2, ymm2, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm2, ymm2, ymm14 + vpsubw ymm8, ymm8, ymm0 + vpsubw ymm9, ymm9, ymm2 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 4: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+1344] + vmovdqu ymm12, YMMWORD PTR [rbx+1376] + vmovdqu ymm11, YMMWORD PTR [rbx+1408] + vmovdqu ymm13, YMMWORD PTR [rbx+1440] + vpunpckldq ymm0, ymm8, ymm1 + vpunpckhdq ymm1, ymm8, ymm1 + vpunpckldq ymm2, ymm9, ymm3 + vpunpckhdq ymm3, ymm9, ymm3 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 8: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+1472] + vmovdqu ymm12, YMMWORD PTR [rbx+1504] + vmovdqu ymm11, YMMWORD PTR [rbx+1536] + vmovdqu ymm13, YMMWORD 
PTR [rbx+1568] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm2, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm2, ymm2, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm2, ymm2, ymm14 + vpsubw ymm8, ymm8, ymm0 + vpsubw ymm9, ymm9, ymm2 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 16: 2/2 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+1600] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+1632] + vperm2i128 ymm2, ymm9, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+1664] + vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+1696] + vpsubw ymm8, ymm0, ymm1 + vpsubw ymm9, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm1 + vpaddw ymm2, ymm2, ymm3 + vpmullw ymm1, ymm8, ymm12 + vpmullw ymm3, ymm9, ymm13 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm11 + vpmulhw ymm1, ymm1, ymm14 + vpmulhw ymm3, ymm3, ymm14 + vpsubw ymm1, ymm8, ymm1 + vpsubw ymm3, ymm9, ymm3 + ; 32: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+1728] + vmovdqu ymm12, YMMWORD PTR [rbx+1760] + vpaddw ymm8, ymm0, ymm2 + vpaddw ymm9, ymm1, ymm3 + vpsubw ymm2, ymm0, ymm2 + vpsubw ymm3, ymm1, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm1, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm9, ymm1 + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm2, ymm2, ymm8 + vpsubw ymm3, ymm3, ymm9 + ; 2: 2/2 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, 
YMMWORD PTR [rbx+1792] + vperm2i128 ymm9, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+1824] + vpsllq ymm4, ymm9, 32 + vpsrlq ymm5, ymm8, 32 + vpblendd ymm4, ymm8, ymm4, 170 + vpblendd ymm5, ymm9, ymm5, 85 + vperm2i128 ymm8, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+1856] + vperm2i128 ymm9, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+1888] + vpsllq ymm6, ymm9, 32 + vpsrlq ymm7, ymm8, 32 + vpblendd ymm6, ymm8, ymm6, 170 + vpblendd ymm7, ymm9, ymm7, 85 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm6, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm6, ymm6, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm6, ymm6, ymm14 + vpsubw ymm8, ymm8, ymm4 + vpsubw ymm9, ymm9, ymm6 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 4: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+1920] + vmovdqu ymm12, YMMWORD PTR [rbx+1952] + vmovdqu ymm11, YMMWORD PTR [rbx+1984] + vmovdqu ymm13, YMMWORD PTR [rbx+2016] + vpunpckldq ymm4, ymm8, ymm5 + vpunpckhdq ymm5, ymm8, ymm5 + vpunpckldq ymm6, ymm9, ymm7 + vpunpckhdq ymm7, ymm9, ymm7 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 8: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+2048] + vmovdqu ymm12, YMMWORD PTR [rbx+2080] + vmovdqu ymm11, YMMWORD PTR [rbx+2112] + vmovdqu ymm13, YMMWORD PTR [rbx+2144] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + 
vpsubw ymm7, ymm6, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm6, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm6, ymm6, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm6, ymm6, ymm14 + vpsubw ymm8, ymm8, ymm4 + vpsubw ymm9, ymm9, ymm6 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 16: 2/2 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [rbx+2176] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [rbx+2208] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [rbx+2240] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [rbx+2272] + vpsubw ymm8, ymm4, ymm5 + vpsubw ymm9, ymm6, ymm7 + vpaddw ymm4, ymm4, ymm5 + vpaddw ymm6, ymm6, ymm7 + vpmullw ymm5, ymm8, ymm12 + vpmullw ymm7, ymm9, ymm13 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm11 + vpmulhw ymm5, ymm5, ymm14 + vpmulhw ymm7, ymm7, ymm14 + vpsubw ymm5, ymm8, ymm5 + vpsubw ymm7, ymm9, ymm7 + ; 32: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+2304] + vmovdqu ymm12, YMMWORD PTR [rbx+2336] + vpaddw ymm8, ymm4, ymm6 + vpaddw ymm9, ymm5, ymm7 + vpsubw ymm6, ymm4, ymm6 + vpsubw ymm7, ymm5, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm5, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm5, ymm5, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + ; 64: 2/2 + vmovdqu ymm10, YMMWORD PTR [rbx+2368] + vmovdqu ymm12, YMMWORD PTR [rbx+2400] + vpsubw ymm8, ymm0, ymm4 + vpsubw ymm9, ymm1, ymm5 + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + vpmullw ymm4, ymm8, ymm12 + vpmullw ymm5, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + 
vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpsubw ymm8, ymm2, ymm6 + vpsubw ymm9, ymm3, ymm7 + vpaddw ymm2, ymm2, ymm6 + vpaddw ymm3, ymm3, ymm7 + vpmullw ymm6, ymm8, ymm12 + vpmullw ymm7, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpmulhw ymm7, ymm7, ymm14 + vpsubw ymm6, ymm8, ymm6 + vpsubw ymm7, ymm9, ymm7 + vmovdqu YMMWORD PTR [r8+256], ymm0 + vmovdqu YMMWORD PTR [r8+288], ymm1 + vmovdqu YMMWORD PTR [r8+320], ymm2 + vmovdqu YMMWORD PTR [r8+352], ymm3 + ; 128 + vmovdqu ymm10, YMMWORD PTR [rbx+2432] + vmovdqu ymm12, YMMWORD PTR [rbx+2464] + vmovdqu ymm11, YMMWORD PTR [rbx+2496] + vmovdqu ymm13, YMMWORD PTR [rbx+2528] + vmovdqu ymm0, YMMWORD PTR [r8+128] + vmovdqu ymm1, YMMWORD PTR [r8+160] + vmovdqu ymm2, YMMWORD PTR [r8+192] + vmovdqu ymm3, YMMWORD PTR [r8+224] + vpsubw ymm8, ymm0, ymm4 + vpsubw ymm9, ymm1, ymm5 + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + vpmullw ymm4, ymm8, ymm12 + vpmullw ymm5, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpaddw ymm8, ymm2, ymm6 + vpaddw ymm9, ymm3, ymm7 + vpsubw ymm6, ymm2, ymm6 + vpsubw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm8, ymm15 + vpmulhw ymm3, ymm9, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm8, ymm2 + vpsubw ymm3, ymm9, ymm3 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm0, ymm0, ymm11 + vpmulhw ymm1, ymm1, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm0, ymm8 + vpsubw ymm1, ymm1, 
ymm9 + vpmullw ymm8, ymm2, ymm13 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm2, ymm2, ymm11 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm2, ymm2, ymm8 + vpsubw ymm3, ymm3, ymm9 + vpmullw ymm8, ymm4, ymm13 + vpmullw ymm9, ymm5, ymm13 + vpmulhw ymm4, ymm4, ymm11 + vpmulhw ymm5, ymm5, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm4, ymm4, ymm8 + vpsubw ymm5, ymm5, ymm9 + vpmullw ymm8, ymm6, ymm13 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm6, ymm6, ymm11 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [r8+160], ymm1 + vmovdqu YMMWORD PTR [r8+192], ymm2 + vmovdqu YMMWORD PTR [r8+224], ymm3 + vmovdqu YMMWORD PTR [r8+384], ymm4 + vmovdqu YMMWORD PTR [r8+416], ymm5 + vmovdqu YMMWORD PTR [r8+448], ymm6 + vmovdqu YMMWORD PTR [r8+480], ymm7 + vmovdqu ymm0, YMMWORD PTR [r8] + vmovdqu ymm1, YMMWORD PTR [r8+32] + vmovdqu ymm2, YMMWORD PTR [r8+64] + vmovdqu ymm3, YMMWORD PTR [r8+96] + vmovdqu ymm4, YMMWORD PTR [r8+256] + vmovdqu ymm5, YMMWORD PTR [r8+288] + vmovdqu ymm6, YMMWORD PTR [r8+320] + vmovdqu ymm7, YMMWORD PTR [r8+352] + vpsubw ymm8, ymm0, ymm4 + vpsubw ymm9, ymm1, ymm5 + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + vpmullw ymm4, ymm8, ymm12 + vpmullw ymm5, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpaddw ymm8, ymm2, ymm6 + vpaddw ymm9, ymm3, ymm7 + vpsubw ymm6, ymm2, ymm6 + vpsubw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm8, ymm15 + vpmulhw ymm3, ymm9, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm8, ymm2 + vpsubw ymm3, ymm9, ymm3 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, 
ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm0, ymm0, ymm11 + vpmulhw ymm1, ymm1, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm0, ymm8 + vpsubw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm2, ymm13 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm2, ymm2, ymm11 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm2, ymm2, ymm8 + vpsubw ymm3, ymm3, ymm9 + vpmullw ymm8, ymm4, ymm13 + vpmullw ymm9, ymm5, ymm13 + vpmulhw ymm4, ymm4, ymm11 + vpmulhw ymm5, ymm5, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm4, ymm4, ymm8 + vpsubw ymm5, ymm5, ymm9 + vpmullw ymm8, ymm6, ymm13 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm6, ymm6, ymm11 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu YMMWORD PTR [r8+256], ymm4 + vmovdqu YMMWORD PTR [r8+288], ymm5 + vmovdqu YMMWORD PTR [r8+320], ymm6 + vmovdqu YMMWORD PTR [r8+352], ymm7 + ; Add Errors + vmovdqu ymm0, YMMWORD PTR [r12] + vmovdqu ymm1, YMMWORD PTR [r12+32] + vmovdqu ymm2, YMMWORD PTR [r12+64] + vmovdqu ymm3, YMMWORD PTR [r12+96] + vmovdqu ymm4, YMMWORD PTR [r11] + vmovdqu ymm5, YMMWORD PTR [r11+32] + vmovdqu ymm6, YMMWORD PTR [r11+64] + vmovdqu ymm7, YMMWORD PTR [r11+96] + vpaddw ymm4, ymm4, ymm0 + vpaddw ymm5, ymm5, ymm1 + vpaddw ymm6, ymm6, ymm2 + vpaddw ymm7, ymm7, ymm3 + vmovdqu ymm0, YMMWORD PTR [r8] + vmovdqu ymm1, YMMWORD PTR [r8+32] + vmovdqu ymm2, YMMWORD PTR [r8+64] + vmovdqu ymm3, YMMWORD PTR [r8+96] + vpaddw ymm4, ymm0, ymm4 + vpaddw ymm5, ymm1, ymm5 + vpmulhw ymm0, ymm4, ymm15 + vpmulhw ymm1, ymm5, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw 
ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm4, ymm0 + vpsubw ymm1, ymm5, ymm1 + vpaddw ymm6, ymm2, ymm6 + vpaddw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm6, ymm15 + vpmulhw ymm3, ymm7, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm6, ymm2 + vpsubw ymm3, ymm7, ymm3 + vmovdqu YMMWORD PTR [r8], ymm0 + vmovdqu YMMWORD PTR [r8+32], ymm1 + vmovdqu YMMWORD PTR [r8+64], ymm2 + vmovdqu YMMWORD PTR [r8+96], ymm3 + vmovdqu ymm0, YMMWORD PTR [r12+128] + vmovdqu ymm1, YMMWORD PTR [r12+160] + vmovdqu ymm2, YMMWORD PTR [r12+192] + vmovdqu ymm3, YMMWORD PTR [r12+224] + vmovdqu ymm4, YMMWORD PTR [r11+128] + vmovdqu ymm5, YMMWORD PTR [r11+160] + vmovdqu ymm6, YMMWORD PTR [r11+192] + vmovdqu ymm7, YMMWORD PTR [r11+224] + vpaddw ymm4, ymm4, ymm0 + vpaddw ymm5, ymm5, ymm1 + vpaddw ymm6, ymm6, ymm2 + vpaddw ymm7, ymm7, ymm3 + vmovdqu ymm0, YMMWORD PTR [r8+128] + vmovdqu ymm1, YMMWORD PTR [r8+160] + vmovdqu ymm2, YMMWORD PTR [r8+192] + vmovdqu ymm3, YMMWORD PTR [r8+224] + vpaddw ymm4, ymm0, ymm4 + vpaddw ymm5, ymm1, ymm5 + vpmulhw ymm0, ymm4, ymm15 + vpmulhw ymm1, ymm5, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm4, ymm0 + vpsubw ymm1, ymm5, ymm1 + vpaddw ymm6, ymm2, ymm6 + vpaddw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm6, ymm15 + vpmulhw ymm3, ymm7, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm6, ymm2 + vpsubw ymm3, ymm7, ymm3 + vmovdqu YMMWORD PTR [r8+128], ymm0 + vmovdqu YMMWORD PTR [r8+160], ymm1 + vmovdqu YMMWORD PTR [r8+192], ymm2 + vmovdqu YMMWORD PTR [r8+224], ymm3 + vmovdqu ymm0, YMMWORD PTR [r12+256] + vmovdqu ymm1, YMMWORD PTR [r12+288] + vmovdqu ymm2, YMMWORD PTR [r12+320] + vmovdqu ymm3, YMMWORD PTR [r12+352] + vmovdqu ymm4, YMMWORD PTR [r11+256] + vmovdqu ymm5, YMMWORD PTR [r11+288] + vmovdqu ymm6, YMMWORD PTR 
[r11+320] + vmovdqu ymm7, YMMWORD PTR [r11+352] + vpaddw ymm4, ymm4, ymm0 + vpaddw ymm5, ymm5, ymm1 + vpaddw ymm6, ymm6, ymm2 + vpaddw ymm7, ymm7, ymm3 + vmovdqu ymm0, YMMWORD PTR [r8+256] + vmovdqu ymm1, YMMWORD PTR [r8+288] + vmovdqu ymm2, YMMWORD PTR [r8+320] + vmovdqu ymm3, YMMWORD PTR [r8+352] + vpaddw ymm4, ymm0, ymm4 + vpaddw ymm5, ymm1, ymm5 + vpmulhw ymm0, ymm4, ymm15 + vpmulhw ymm1, ymm5, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm4, ymm0 + vpsubw ymm1, ymm5, ymm1 + vpaddw ymm6, ymm2, ymm6 + vpaddw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm6, ymm15 + vpmulhw ymm3, ymm7, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm6, ymm2 + vpsubw ymm3, ymm7, ymm3 + vmovdqu YMMWORD PTR [r8+256], ymm0 + vmovdqu YMMWORD PTR [r8+288], ymm1 + vmovdqu YMMWORD PTR [r8+320], ymm2 + vmovdqu YMMWORD PTR [r8+352], ymm3 + vmovdqu ymm0, YMMWORD PTR [r12+384] + vmovdqu ymm1, YMMWORD PTR [r12+416] + vmovdqu ymm2, YMMWORD PTR [r12+448] + vmovdqu ymm3, YMMWORD PTR [r12+480] + vmovdqu ymm4, YMMWORD PTR [r11+384] + vmovdqu ymm5, YMMWORD PTR [r11+416] + vmovdqu ymm6, YMMWORD PTR [r11+448] + vmovdqu ymm7, YMMWORD PTR [r11+480] + vpaddw ymm4, ymm4, ymm0 + vpaddw ymm5, ymm5, ymm1 + vpaddw ymm6, ymm6, ymm2 + vpaddw ymm7, ymm7, ymm3 + vmovdqu ymm0, YMMWORD PTR [r8+384] + vmovdqu ymm1, YMMWORD PTR [r8+416] + vmovdqu ymm2, YMMWORD PTR [r8+448] + vmovdqu ymm3, YMMWORD PTR [r8+480] + vpaddw ymm4, ymm0, ymm4 + vpaddw ymm5, ymm1, ymm5 + vpmulhw ymm0, ymm4, ymm15 + vpmulhw ymm1, ymm5, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm4, ymm0 + vpsubw ymm1, ymm5, ymm1 + vpaddw ymm6, ymm2, ymm6 + vpaddw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm6, ymm15 + vpmulhw ymm3, ymm7, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, 
ymm3, ymm14 + vpsubw ymm2, ymm6, ymm2 + vpsubw ymm3, ymm7, ymm3 + vmovdqu YMMWORD PTR [r8+384], ymm0 + vmovdqu YMMWORD PTR [r8+416], ymm1 + vmovdqu YMMWORD PTR [r8+448], ymm2 + vmovdqu YMMWORD PTR [r8+480], ymm3 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp+48] + vmovdqu xmm7, OWORD PTR [rsp+64] + vmovdqu xmm8, OWORD PTR [rsp+80] + vmovdqu xmm9, OWORD PTR [rsp+96] + vmovdqu xmm10, OWORD PTR [rsp+112] + vmovdqu xmm11, OWORD PTR [rsp+128] + vmovdqu xmm12, OWORD PTR [rsp+144] + vmovdqu xmm13, OWORD PTR [rsp+160] + vmovdqu xmm14, OWORD PTR [rsp+176] + vmovdqu xmm15, OWORD PTR [rsp+192] + add rsp, 208 + pop rbx + pop rsi + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +mlkem_encapsulate_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +mlkem_decapsulate_avx2 PROC + push r12 + mov rax, QWORD PTR [rsp+48] + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqu ymm14, YMMWORD PTR mlkem_q + vmovdqu ymm15, YMMWORD PTR mlkem_v + movsxd r10, eax + mov r11, r8 +L_mlkem_decapsulate_avx2_trans: + ; ntt + mov r12, QWORD PTR [ptr_L_mlkem_avx2_zetas] + vmovdqu ymm10, YMMWORD PTR [r12] + vmovdqu ymm12, YMMWORD PTR [r12+32] + vmovdqu ymm0, YMMWORD PTR [r11+128] + vmovdqu ymm1, YMMWORD PTR [r11+160] + vmovdqu ymm2, YMMWORD PTR [r11+192] + vmovdqu ymm3, YMMWORD PTR [r11+224] + vmovdqu ymm4, YMMWORD PTR [r11+384] + vmovdqu ymm5, YMMWORD PTR [r11+416] + vmovdqu ymm6, YMMWORD PTR [r11+448] + vmovdqu ymm7, YMMWORD PTR [r11+480] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, 
ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + vmovdqu YMMWORD PTR [r11+128], ymm0 + vmovdqu YMMWORD PTR [r11+160], ymm1 + vmovdqu YMMWORD PTR [r11+192], ymm2 + vmovdqu YMMWORD PTR [r11+224], ymm3 + vmovdqu YMMWORD PTR [r11+384], ymm4 + vmovdqu YMMWORD PTR [r11+416], ymm5 + vmovdqu YMMWORD PTR [r11+448], ymm6 + vmovdqu YMMWORD PTR [r11+480], ymm7 + vmovdqu ymm0, YMMWORD PTR [r11] + vmovdqu ymm1, YMMWORD PTR [r11+32] + vmovdqu ymm2, YMMWORD PTR [r11+64] + vmovdqu ymm3, YMMWORD PTR [r11+96] + vmovdqu ymm4, YMMWORD PTR [r11+256] + vmovdqu ymm5, YMMWORD PTR [r11+288] + vmovdqu ymm6, YMMWORD PTR [r11+320] + vmovdqu ymm7, YMMWORD PTR [r11+352] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + vmovdqu YMMWORD PTR [r11+256], ymm4 + vmovdqu YMMWORD PTR [r11+288], ymm5 + vmovdqu YMMWORD PTR [r11+320], ymm6 + vmovdqu YMMWORD PTR [r11+352], ymm7 + vmovdqu ymm4, YMMWORD PTR [r11+128] + vmovdqu ymm5, YMMWORD PTR [r11+160] + vmovdqu ymm6, YMMWORD PTR [r11+192] + vmovdqu ymm7, YMMWORD PTR [r11+224] + ; 64: 0/3 + vmovdqu ymm10, YMMWORD 
PTR [r12+64] + vmovdqu ymm12, YMMWORD PTR [r12+96] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + ; 32: 0/3 + vmovdqu ymm10, YMMWORD PTR [r12+128] + vmovdqu ymm12, YMMWORD PTR [r12+160] + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm2, ymm0, ymm8 + vpsubw ymm3, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + ; 32: 0/3 + vmovdqu ymm10, YMMWORD PTR [r12+192] + vmovdqu ymm12, YMMWORD PTR [r12+224] + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm4, ymm8 + vpsubw ymm7, ymm5, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm5, ymm5, ymm9 + ; 16: 0/3 + vmovdqu ymm10, YMMWORD PTR [r12+256] + vmovdqu ymm12, YMMWORD PTR [r12+288] + vmovdqu ymm11, YMMWORD PTR [r12+320] + vmovdqu ymm13, YMMWORD PTR [r12+352] + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + 
vpaddw ymm2, ymm2, ymm9 + ; 16: 0/3 + vmovdqu ymm10, YMMWORD PTR [r12+384] + vmovdqu ymm12, YMMWORD PTR [r12+416] + vmovdqu ymm11, YMMWORD PTR [r12+448] + vmovdqu ymm13, YMMWORD PTR [r12+480] + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 8: 0/3 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [r12+512] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [r12+544] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [r12+576] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [r12+608] + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm0, ymm1, ymm0 + vpsubw ymm2, ymm3, ymm2 + vpsubw ymm1, ymm8, ymm0 + vpsubw ymm3, ymm9, ymm2 + vpaddw ymm8, ymm8, ymm0 + vpaddw ymm9, ymm9, ymm2 + ; 4: 0/3 + vmovdqu ymm10, YMMWORD PTR [r12+640] + vmovdqu ymm12, YMMWORD PTR [r12+672] + vmovdqu ymm11, YMMWORD PTR [r12+704] + vmovdqu ymm13, YMMWORD PTR [r12+736] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 8: 0/3 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [r12+768] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [r12+800] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [r12+832] + 
vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [r12+864] + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm4, ymm5, ymm4 + vpsubw ymm6, ymm7, ymm6 + vpsubw ymm5, ymm8, ymm4 + vpsubw ymm7, ymm9, ymm6 + vpaddw ymm8, ymm8, ymm4 + vpaddw ymm9, ymm9, ymm6 + ; 4: 0/3 + vmovdqu ymm10, YMMWORD PTR [r12+896] + vmovdqu ymm12, YMMWORD PTR [r12+928] + vmovdqu ymm11, YMMWORD PTR [r12+960] + vmovdqu ymm13, YMMWORD PTR [r12+992] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 2: 0/3 + vmovdqu ymm10, YMMWORD PTR [r12+1024] + vmovdqu ymm12, YMMWORD PTR [r12+1056] + vmovdqu ymm11, YMMWORD PTR [r12+1088] + vmovdqu ymm13, YMMWORD PTR [r12+1120] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 2: 0/3 + vmovdqu ymm10, YMMWORD PTR [r12+1152] + vmovdqu ymm12, YMMWORD PTR [r12+1184] + vmovdqu ymm11, YMMWORD PTR [r12+1216] + vmovdqu ymm13, YMMWORD PTR [r12+1248] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, 
ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vperm2i128 ymm2, ymm8, ymm9, 32 + vperm2i128 ymm3, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vperm2i128 ymm4, ymm8, ymm9, 32 + vperm2i128 ymm5, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vperm2i128 ymm6, ymm8, ymm9, 32 + vperm2i128 ymm7, ymm8, ymm9, 49 + vpmulhw ymm8, ymm0, ymm15 + vpmulhw ymm9, ymm1, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm0, ymm8 + vpsubw ymm9, ymm1, ymm9 + vmovdqu YMMWORD PTR [r11], ymm8 + vmovdqu YMMWORD PTR [r11+32], ymm9 + vpmulhw ymm8, ymm2, ymm15 + vpmulhw ymm9, ymm3, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vmovdqu YMMWORD PTR [r11+64], ymm8 + vmovdqu YMMWORD PTR [r11+96], ymm9 + vpmulhw ymm8, ymm4, ymm15 + vpmulhw ymm9, ymm5, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vmovdqu YMMWORD PTR [r11+128], ymm8 + vmovdqu YMMWORD PTR [r11+160], ymm9 + vpmulhw ymm8, ymm6, ymm15 + vpmulhw ymm9, ymm7, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, 
ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vmovdqu YMMWORD PTR [r11+192], ymm8 + vmovdqu YMMWORD PTR [r11+224], ymm9 + vmovdqu ymm0, YMMWORD PTR [r11+256] + vmovdqu ymm1, YMMWORD PTR [r11+288] + vmovdqu ymm2, YMMWORD PTR [r11+320] + vmovdqu ymm3, YMMWORD PTR [r11+352] + vmovdqu ymm4, YMMWORD PTR [r11+384] + vmovdqu ymm5, YMMWORD PTR [r11+416] + vmovdqu ymm6, YMMWORD PTR [r11+448] + vmovdqu ymm7, YMMWORD PTR [r11+480] + ; 64: 1/3 + vmovdqu ymm10, YMMWORD PTR [r12+1280] + vmovdqu ymm12, YMMWORD PTR [r12+1312] + vpmullw ymm8, ymm4, ymm12 + vpmullw ymm9, ymm5, ymm12 + vpmulhw ymm4, ymm4, ymm10 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vpsubw ymm4, ymm0, ymm8 + vpsubw ymm5, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm2, ymm8 + vpsubw ymm7, ymm3, ymm9 + vpaddw ymm2, ymm2, ymm8 + vpaddw ymm3, ymm3, ymm9 + ; 32: 1/3 + vmovdqu ymm10, YMMWORD PTR [r12+1344] + vmovdqu ymm12, YMMWORD PTR [r12+1376] + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm2, ymm0, ymm8 + vpsubw ymm3, ymm1, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + ; 32: 1/3 + vmovdqu ymm10, YMMWORD PTR [r12+1408] + vmovdqu ymm12, YMMWORD PTR [r12+1440] + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm6, ymm4, ymm8 + vpsubw ymm7, ymm5, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm5, ymm5, ymm9 + ; 16: 
1/3 + vmovdqu ymm10, YMMWORD PTR [r12+1472] + vmovdqu ymm12, YMMWORD PTR [r12+1504] + vmovdqu ymm11, YMMWORD PTR [r12+1536] + vmovdqu ymm13, YMMWORD PTR [r12+1568] + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 16: 1/3 + vmovdqu ymm10, YMMWORD PTR [r12+1600] + vmovdqu ymm12, YMMWORD PTR [r12+1632] + vmovdqu ymm11, YMMWORD PTR [r12+1664] + vmovdqu ymm13, YMMWORD PTR [r12+1696] + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 8: 1/3 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [r12+1728] + vperm2i128 ymm1, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [r12+1760] + vperm2i128 ymm9, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [r12+1792] + vperm2i128 ymm3, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [r12+1824] + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm0, ymm1, ymm0 + vpsubw ymm2, ymm3, ymm2 + vpsubw ymm1, ymm8, ymm0 + vpsubw ymm3, ymm9, ymm2 + vpaddw ymm8, ymm8, ymm0 + vpaddw ymm9, ymm9, ymm2 + ; 4: 1/3 + vmovdqu ymm10, YMMWORD PTR [r12+1856] + vmovdqu ymm12, YMMWORD PTR [r12+1888] + vmovdqu ymm11, YMMWORD PTR [r12+1920] + vmovdqu ymm13, YMMWORD PTR [r12+1952] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + 
vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 8: 1/3 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [r12+1984] + vperm2i128 ymm5, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [r12+2016] + vperm2i128 ymm9, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [r12+2048] + vperm2i128 ymm7, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [r12+2080] + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm4, ymm5, ymm4 + vpsubw ymm6, ymm7, ymm6 + vpsubw ymm5, ymm8, ymm4 + vpsubw ymm7, ymm9, ymm6 + vpaddw ymm8, ymm8, ymm4 + vpaddw ymm9, ymm9, ymm6 + ; 4: 1/3 + vmovdqu ymm10, YMMWORD PTR [r12+2112] + vmovdqu ymm12, YMMWORD PTR [r12+2144] + vmovdqu ymm11, YMMWORD PTR [r12+2176] + vmovdqu ymm13, YMMWORD PTR [r12+2208] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + ; 2: 1/3 + vmovdqu ymm10, YMMWORD PTR [r12+2240] + vmovdqu ymm12, YMMWORD PTR [r12+2272] + vmovdqu ymm11, YMMWORD PTR [r12+2304] + vmovdqu ymm13, YMMWORD PTR [r12+2336] + vpsllq ymm8, ymm1, 32 + vpsrlq ymm9, ymm0, 32 + vpblendd ymm0, ymm0, ymm8, 170 + vpblendd ymm1, ymm1, ymm9, 85 + vpsllq ymm8, ymm3, 32 + vpsrlq ymm9, ymm2, 32 + vpblendd ymm2, ymm2, ymm8, 170 + vpblendd ymm3, ymm3, ymm9, 85 + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + 
vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm1, ymm8 + vpsubw ymm9, ymm3, ymm9 + vpsubw ymm1, ymm0, ymm8 + vpsubw ymm3, ymm2, ymm9 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm2, ymm2, ymm9 + ; 2: 1/3 + vmovdqu ymm10, YMMWORD PTR [r12+2368] + vmovdqu ymm12, YMMWORD PTR [r12+2400] + vmovdqu ymm11, YMMWORD PTR [r12+2432] + vmovdqu ymm13, YMMWORD PTR [r12+2464] + vpsllq ymm8, ymm5, 32 + vpsrlq ymm9, ymm4, 32 + vpblendd ymm4, ymm4, ymm8, 170 + vpblendd ymm5, ymm5, ymm9, 85 + vpsllq ymm8, ymm7, 32 + vpsrlq ymm9, ymm6, 32 + vpblendd ymm6, ymm6, ymm8, 170 + vpblendd ymm7, ymm7, ymm9, 85 + vpmullw ymm8, ymm5, ymm12 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm5, ymm8 + vpsubw ymm9, ymm7, ymm9 + vpsubw ymm5, ymm4, ymm8 + vpsubw ymm7, ymm6, ymm9 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm6, ymm6, ymm9 + vpunpckldq ymm8, ymm0, ymm1 + vpunpckhdq ymm9, ymm0, ymm1 + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm2, ymm3 + vpunpckhdq ymm9, ymm2, ymm3 + vperm2i128 ymm2, ymm8, ymm9, 32 + vperm2i128 ymm3, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm4, ymm5 + vpunpckhdq ymm9, ymm4, ymm5 + vperm2i128 ymm4, ymm8, ymm9, 32 + vperm2i128 ymm5, ymm8, ymm9, 49 + vpunpckldq ymm8, ymm6, ymm7 + vpunpckhdq ymm9, ymm6, ymm7 + vperm2i128 ymm6, ymm8, ymm9, 32 + vperm2i128 ymm7, ymm8, ymm9, 49 + vpmulhw ymm8, ymm0, ymm15 + vpmulhw ymm9, ymm1, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm0, ymm8 + vpsubw ymm9, ymm1, ymm9 + vmovdqu YMMWORD PTR [r11+256], ymm8 + vmovdqu YMMWORD PTR [r11+288], ymm9 + vpmulhw ymm8, ymm2, ymm15 + vpmulhw ymm9, ymm3, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm2, ymm8 + vpsubw ymm9, ymm3, ymm9 + vmovdqu YMMWORD PTR [r11+320], ymm8 + vmovdqu YMMWORD 
PTR [r11+352], ymm9 + vpmulhw ymm8, ymm4, ymm15 + vpmulhw ymm9, ymm5, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm4, ymm8 + vpsubw ymm9, ymm5, ymm9 + vmovdqu YMMWORD PTR [r11+384], ymm8 + vmovdqu YMMWORD PTR [r11+416], ymm9 + vpmulhw ymm8, ymm6, ymm15 + vpmulhw ymm9, ymm7, ymm15 + vpsraw ymm8, ymm8, 10 + vpsraw ymm9, ymm9, 10 + vpmullw ymm8, ymm8, ymm14 + vpmullw ymm9, ymm9, ymm14 + vpsubw ymm8, ymm6, ymm8 + vpsubw ymm9, ymm7, ymm9 + vmovdqu YMMWORD PTR [r11+448], ymm8 + vmovdqu YMMWORD PTR [r11+480], ymm9 + add r11, 512 + sub r10, 1 + jg L_mlkem_decapsulate_avx2_trans + vmovdqu ymm12, YMMWORD PTR mlkem_qinv + ; Pointwise acc mont + movsxd r10, eax + ; Base mul mont + mov r12, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul] + vmovdqu ymm2, YMMWORD PTR [rcx] + vmovdqu ymm3, YMMWORD PTR [rcx+32] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8] + vmovdqu ymm5, YMMWORD PTR [r8+32] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12] + vmovdqu ymm11, YMMWORD PTR [r12+32] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu 
YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+64] + vmovdqu ymm5, YMMWORD PTR [r8+96] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+64] + vmovdqu ymm11, YMMWORD PTR [r12+96] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+64], ymm0 + vmovdqu YMMWORD PTR [rdx+96], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+128] + vmovdqu ymm3, YMMWORD PTR [rcx+160] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+128] + vmovdqu ymm5, YMMWORD PTR [r8+160] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+128] + vmovdqu ymm11, YMMWORD PTR [r12+160] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, 
ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+128], ymm0 + vmovdqu YMMWORD PTR [rdx+160], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+192] + vmovdqu ymm3, YMMWORD PTR [rcx+224] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+192] + vmovdqu ymm5, YMMWORD PTR [r8+224] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+192] + vmovdqu ymm11, YMMWORD PTR [r12+224] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+192], ymm0 + vmovdqu YMMWORD PTR [rdx+224], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+256] + vmovdqu ymm3, YMMWORD PTR [rcx+288] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, 
ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+256] + vmovdqu ymm5, YMMWORD PTR [r8+288] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+256] + vmovdqu ymm11, YMMWORD PTR [r12+288] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+256], ymm0 + vmovdqu YMMWORD PTR [rdx+288], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+320] + vmovdqu ymm5, YMMWORD PTR [r8+352] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+320] + vmovdqu ymm11, YMMWORD PTR [r12+352] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, 
ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+320], ymm0 + vmovdqu YMMWORD PTR [rdx+352], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+384] + vmovdqu ymm3, YMMWORD PTR [rcx+416] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+384] + vmovdqu ymm5, YMMWORD PTR [r8+416] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+384] + vmovdqu ymm11, YMMWORD PTR [r12+416] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+384], ymm0 + vmovdqu YMMWORD PTR [rdx+416], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+448] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+448] + vmovdqu ymm5, YMMWORD PTR [r8+480] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, 
ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+448] + vmovdqu ymm11, YMMWORD PTR [r12+480] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu YMMWORD PTR [rdx+448], ymm0 + vmovdqu YMMWORD PTR [rdx+480], ymm1 + add rcx, 512 + add r8, 512 + sub r10, 2 + jz L_pointwise_acc_mont_end_decap +L_pointwise_acc_mont_start_decap: + ; Base mul mont add + mov r12, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul] + vmovdqu ymm2, YMMWORD PTR [rcx] + vmovdqu ymm3, YMMWORD PTR [rcx+32] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8] + vmovdqu ymm5, YMMWORD PTR [r8+32] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12] + vmovdqu ymm11, YMMWORD PTR [r12+32] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw 
ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx] + vmovdqu ymm7, YMMWORD PTR [rdx+32] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+64] + vmovdqu ymm5, YMMWORD PTR [r8+96] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+64] + vmovdqu ymm11, YMMWORD PTR [r12+96] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+64] + vmovdqu ymm7, YMMWORD PTR [rdx+96] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx+64], ymm0 + vmovdqu YMMWORD PTR [rdx+96], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+128] + vmovdqu ymm3, YMMWORD PTR [rcx+160] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, 
ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+128] + vmovdqu ymm5, YMMWORD PTR [r8+160] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+128] + vmovdqu ymm11, YMMWORD PTR [r12+160] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+128] + vmovdqu ymm7, YMMWORD PTR [rdx+160] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx+128], ymm0 + vmovdqu YMMWORD PTR [rdx+160], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+192] + vmovdqu ymm3, YMMWORD PTR [rcx+224] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+192] + vmovdqu ymm5, YMMWORD PTR [r8+224] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+192] + vmovdqu ymm11, YMMWORD PTR [r12+224] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + 
vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+192] + vmovdqu ymm7, YMMWORD PTR [rdx+224] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx+192], ymm0 + vmovdqu YMMWORD PTR [rdx+224], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+256] + vmovdqu ymm3, YMMWORD PTR [rcx+288] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+256] + vmovdqu ymm5, YMMWORD PTR [r8+288] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+256] + vmovdqu ymm11, YMMWORD PTR [r12+288] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+256] + vmovdqu ymm7, YMMWORD PTR [rdx+288] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + 
vmovdqu YMMWORD PTR [rdx+256], ymm0 + vmovdqu YMMWORD PTR [rdx+288], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+320] + vmovdqu ymm5, YMMWORD PTR [r8+352] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+320] + vmovdqu ymm11, YMMWORD PTR [r12+352] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+320] + vmovdqu ymm7, YMMWORD PTR [rdx+352] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx+320], ymm0 + vmovdqu YMMWORD PTR [rdx+352], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+384] + vmovdqu ymm3, YMMWORD PTR [rcx+416] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+384] + vmovdqu ymm5, YMMWORD PTR [r8+416] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+384] + vmovdqu ymm11, YMMWORD PTR [r12+416] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, 
ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+384] + vmovdqu ymm7, YMMWORD PTR [rdx+416] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx+384], ymm0 + vmovdqu YMMWORD PTR [rdx+416], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+448] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+448] + vmovdqu ymm5, YMMWORD PTR [r8+480] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+448] + vmovdqu ymm11, YMMWORD PTR [r12+480] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, 
ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+448] + vmovdqu ymm7, YMMWORD PTR [rdx+480] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vmovdqu YMMWORD PTR [rdx+448], ymm0 + vmovdqu YMMWORD PTR [rdx+480], ymm1 + add rcx, 512 + add r8, 512 + sub r10, 1 + jg L_pointwise_acc_mont_start_decap +L_pointwise_acc_mont_end_decap: + ; Base mul mont add + mov r12, QWORD PTR [ptr_L_mlkem_avx2_zetas_basemul] + vmovdqu ymm2, YMMWORD PTR [rcx] + vmovdqu ymm3, YMMWORD PTR [rcx+32] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8] + vmovdqu ymm5, YMMWORD PTR [r8+32] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12] + vmovdqu ymm11, YMMWORD PTR [r12+32] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx] + vmovdqu ymm7, YMMWORD PTR [rdx+32] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, 
YMMWORD PTR [rcx+96] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+64] + vmovdqu ymm5, YMMWORD PTR [r8+96] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+64] + vmovdqu ymm11, YMMWORD PTR [r12+96] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+64] + vmovdqu ymm7, YMMWORD PTR [rdx+96] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+64], ymm0 + vmovdqu YMMWORD PTR [rdx+96], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+128] + vmovdqu ymm3, YMMWORD PTR [rcx+160] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+128] + vmovdqu ymm5, YMMWORD PTR [r8+160] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+128] + vmovdqu ymm11, YMMWORD PTR [r12+160] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, 
ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+128] + vmovdqu ymm7, YMMWORD PTR [rdx+160] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+128], ymm0 + vmovdqu YMMWORD PTR [rdx+160], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+192] + vmovdqu ymm3, YMMWORD PTR [rcx+224] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+192] + vmovdqu ymm5, YMMWORD PTR [r8+224] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+192] + vmovdqu ymm11, YMMWORD PTR [r12+224] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw 
ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+192] + vmovdqu ymm7, YMMWORD PTR [rdx+224] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+192], ymm0 + vmovdqu YMMWORD PTR [rdx+224], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+256] + vmovdqu ymm3, YMMWORD PTR [rcx+288] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+256] + vmovdqu ymm5, YMMWORD PTR [r8+288] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+256] + vmovdqu ymm11, YMMWORD PTR [r12+288] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+256] + vmovdqu ymm7, YMMWORD PTR [rdx+288] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+256], ymm0 + vmovdqu YMMWORD PTR [rdx+288], ymm1 + vmovdqu ymm2, 
YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+320] + vmovdqu ymm5, YMMWORD PTR [r8+352] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+320] + vmovdqu ymm11, YMMWORD PTR [r12+352] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+320] + vmovdqu ymm7, YMMWORD PTR [rdx+352] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+320], ymm0 + vmovdqu YMMWORD PTR [rdx+352], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+384] + vmovdqu ymm3, YMMWORD PTR [rcx+416] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+384] + vmovdqu ymm5, YMMWORD PTR [r8+416] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+384] + vmovdqu ymm11, YMMWORD PTR [r12+416] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, 
ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+384] + vmovdqu ymm7, YMMWORD PTR [rdx+416] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+384], ymm0 + vmovdqu YMMWORD PTR [rdx+416], ymm1 + vmovdqu ymm2, YMMWORD PTR [rcx+448] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vpslld ymm6, ymm3, 16 + vpsrld ymm7, ymm2, 16 + vpblendw ymm2, ymm2, ymm6, 170 + vpblendw ymm3, ymm3, ymm7, 85 + vmovdqu ymm4, YMMWORD PTR [r8+448] + vmovdqu ymm5, YMMWORD PTR [r8+480] + vpslld ymm6, ymm5, 16 + vpsrld ymm7, ymm4, 16 + vpblendw ymm4, ymm4, ymm6, 170 + vpblendw ymm5, ymm5, ymm7, 85 + vmovdqu ymm10, YMMWORD PTR [r12+448] + vmovdqu ymm11, YMMWORD PTR [r12+480] + vpmullw ymm0, ymm3, ymm5 + vpmulhw ymm6, ymm3, ymm5 + vpmullw ymm1, ymm2, ymm4 + vpmulhw ymm7, ymm2, ymm4 + ; Mont Reduce + vpmullw ymm8, ymm0, ymm12 + vpmullw ymm9, ymm1, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm6, ymm8 + vpsubw ymm1, ymm7, ymm9 + vpmullw ymm6, ymm0, ymm11 + vpmulhw ymm7, ymm0, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm0, ymm7, ymm6 + vpaddw ymm0, ymm0, ymm1 + vpmullw ymm1, ymm2, ymm5 + vpmulhw ymm6, ymm2, ymm5 + vpmullw ymm2, ymm3, ymm4 + vpmulhw ymm7, ymm3, ymm4 + ; 
Mont Reduce + vpmullw ymm8, ymm1, ymm12 + vpmullw ymm9, ymm2, ymm12 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm1, ymm6, ymm8 + vpsubw ymm2, ymm7, ymm9 + vpaddw ymm1, ymm1, ymm2 + vmovdqu ymm6, YMMWORD PTR [rdx+448] + vmovdqu ymm7, YMMWORD PTR [rdx+480] + vpaddw ymm0, ymm0, ymm6 + vpaddw ymm1, ymm1, ymm7 + vpslld ymm6, ymm1, 16 + vpsrld ymm7, ymm0, 16 + vpblendw ymm0, ymm0, ymm6, 170 + vpblendw ymm1, ymm1, ymm7, 85 + vmovdqu YMMWORD PTR [rdx+448], ymm0 + vmovdqu YMMWORD PTR [rdx+480], ymm1 + add rcx, 512 + ; invntt + mov r12, QWORD PTR [ptr_L_mlkem_avx2_zetas_inv] + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm1, YMMWORD PTR [rdx+32] + vmovdqu ymm2, YMMWORD PTR [rdx+64] + vmovdqu ymm3, YMMWORD PTR [rdx+96] + vmovdqu ymm4, YMMWORD PTR [rdx+128] + vmovdqu ymm5, YMMWORD PTR [rdx+160] + vmovdqu ymm6, YMMWORD PTR [rdx+192] + vmovdqu ymm7, YMMWORD PTR [rdx+224] + ; 2: 1/2 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [r12] + vperm2i128 ymm9, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [r12+32] + vpsllq ymm0, ymm9, 32 + vpsrlq ymm1, ymm8, 32 + vpblendd ymm0, ymm8, ymm0, 170 + vpblendd ymm1, ymm9, ymm1, 85 + vperm2i128 ymm8, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [r12+64] + vperm2i128 ymm9, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [r12+96] + vpsllq ymm2, ymm9, 32 + vpsrlq ymm3, ymm8, 32 + vpblendd ymm2, ymm8, ymm2, 170 + vpblendd ymm3, ymm9, ymm3, 85 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm2, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm2, ymm2, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm2, ymm2, ymm14 + vpsubw ymm8, ymm8, ymm0 + vpsubw ymm9, ymm9, ymm2 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 4: 1/2 + vmovdqu ymm10, YMMWORD PTR [r12+128] 
+ vmovdqu ymm12, YMMWORD PTR [r12+160] + vmovdqu ymm11, YMMWORD PTR [r12+192] + vmovdqu ymm13, YMMWORD PTR [r12+224] + vpunpckldq ymm0, ymm8, ymm1 + vpunpckhdq ymm1, ymm8, ymm1 + vpunpckldq ymm2, ymm9, ymm3 + vpunpckhdq ymm3, ymm9, ymm3 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 8: 1/2 + vmovdqu ymm10, YMMWORD PTR [r12+256] + vmovdqu ymm12, YMMWORD PTR [r12+288] + vmovdqu ymm11, YMMWORD PTR [r12+320] + vmovdqu ymm13, YMMWORD PTR [r12+352] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm2, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm2, ymm2, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm2, ymm2, ymm14 + vpsubw ymm8, ymm8, ymm0 + vpsubw ymm9, ymm9, ymm2 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 16: 1/2 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [r12+384] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [r12+416] + vperm2i128 ymm2, ymm9, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [r12+448] + vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [r12+480] + vpsubw ymm8, ymm0, ymm1 + vpsubw ymm9, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm1 + vpaddw ymm2, ymm2, ymm3 + vpmullw ymm1, ymm8, ymm12 + vpmullw ymm3, ymm9, ymm13 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm11 + vpmulhw ymm1, ymm1, ymm14 + vpmulhw ymm3, ymm3, ymm14 + vpsubw ymm1, ymm8, ymm1 + vpsubw 
ymm3, ymm9, ymm3 + ; 32: 1/2 + vmovdqu ymm10, YMMWORD PTR [r12+512] + vmovdqu ymm12, YMMWORD PTR [r12+544] + vpaddw ymm8, ymm0, ymm2 + vpaddw ymm9, ymm1, ymm3 + vpsubw ymm2, ymm0, ymm2 + vpsubw ymm3, ymm1, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm1, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm9, ymm1 + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm2, ymm2, ymm8 + vpsubw ymm3, ymm3, ymm9 + ; 2: 1/2 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [r12+576] + vperm2i128 ymm9, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [r12+608] + vpsllq ymm4, ymm9, 32 + vpsrlq ymm5, ymm8, 32 + vpblendd ymm4, ymm8, ymm4, 170 + vpblendd ymm5, ymm9, ymm5, 85 + vperm2i128 ymm8, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [r12+640] + vperm2i128 ymm9, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [r12+672] + vpsllq ymm6, ymm9, 32 + vpsrlq ymm7, ymm8, 32 + vpblendd ymm6, ymm8, ymm6, 170 + vpblendd ymm7, ymm9, ymm7, 85 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm6, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm6, ymm6, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm6, ymm6, ymm14 + vpsubw ymm8, ymm8, ymm4 + vpsubw ymm9, ymm9, ymm6 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 4: 1/2 + vmovdqu ymm10, YMMWORD PTR [r12+704] + vmovdqu ymm12, YMMWORD PTR [r12+736] + vmovdqu ymm11, YMMWORD PTR [r12+768] + vmovdqu ymm13, YMMWORD PTR [r12+800] + vpunpckldq ymm4, ymm8, ymm5 + vpunpckhdq ymm5, ymm8, ymm5 + vpunpckldq ymm6, ymm9, ymm7 + vpunpckhdq ymm7, ymm9, ymm7 + 
vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 8: 1/2 + vmovdqu ymm10, YMMWORD PTR [r12+832] + vmovdqu ymm12, YMMWORD PTR [r12+864] + vmovdqu ymm11, YMMWORD PTR [r12+896] + vmovdqu ymm13, YMMWORD PTR [r12+928] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm6, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm6, ymm6, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm6, ymm6, ymm14 + vpsubw ymm8, ymm8, ymm4 + vpsubw ymm9, ymm9, ymm6 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 16: 1/2 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [r12+960] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [r12+992] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [r12+1024] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [r12+1056] + vpsubw ymm8, ymm4, ymm5 + vpsubw ymm9, ymm6, ymm7 + vpaddw ymm4, ymm4, ymm5 + vpaddw ymm6, ymm6, ymm7 + vpmullw ymm5, ymm8, ymm12 + vpmullw ymm7, ymm9, ymm13 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm11 + vpmulhw ymm5, ymm5, ymm14 + vpmulhw ymm7, ymm7, ymm14 + vpsubw ymm5, ymm8, ymm5 + vpsubw ymm7, ymm9, ymm7 + ; 32: 1/2 + vmovdqu ymm10, YMMWORD PTR [r12+1088] + vmovdqu ymm12, YMMWORD PTR [r12+1120] + vpaddw ymm8, ymm4, ymm6 + vpaddw ymm9, ymm5, ymm7 + vpsubw ymm6, ymm4, ymm6 + vpsubw ymm7, ymm5, ymm7 + vpmulhw ymm4, ymm8, 
ymm15 + vpmulhw ymm5, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm5, ymm5, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + ; 64: 1/2 + vmovdqu ymm10, YMMWORD PTR [r12+1152] + vmovdqu ymm12, YMMWORD PTR [r12+1184] + vpsubw ymm8, ymm0, ymm4 + vpsubw ymm9, ymm1, ymm5 + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + vpmullw ymm4, ymm8, ymm12 + vpmullw ymm5, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpsubw ymm8, ymm2, ymm6 + vpsubw ymm9, ymm3, ymm7 + vpaddw ymm2, ymm2, ymm6 + vpaddw ymm3, ymm3, ymm7 + vpmullw ymm6, ymm8, ymm12 + vpmullw ymm7, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpmulhw ymm7, ymm7, ymm14 + vpsubw ymm6, ymm8, ymm6 + vpsubw ymm7, ymm9, ymm7 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + vmovdqu YMMWORD PTR [rdx+128], ymm4 + vmovdqu YMMWORD PTR [rdx+160], ymm5 + vmovdqu YMMWORD PTR [rdx+192], ymm6 + vmovdqu YMMWORD PTR [rdx+224], ymm7 + vmovdqu ymm0, YMMWORD PTR [rdx+256] + vmovdqu ymm1, YMMWORD PTR [rdx+288] + vmovdqu ymm2, YMMWORD PTR [rdx+320] + vmovdqu ymm3, YMMWORD PTR [rdx+352] + vmovdqu ymm4, YMMWORD PTR [rdx+384] + vmovdqu ymm5, YMMWORD PTR [rdx+416] + vmovdqu ymm6, YMMWORD PTR [rdx+448] + vmovdqu ymm7, YMMWORD PTR [rdx+480] + ; 2: 2/2 + vperm2i128 ymm8, ymm0, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [r12+1216] + vperm2i128 ymm9, ymm0, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [r12+1248] + vpsllq ymm0, ymm9, 32 + vpsrlq ymm1, ymm8, 32 + vpblendd ymm0, ymm8, ymm0, 170 + 
vpblendd ymm1, ymm9, ymm1, 85 + vperm2i128 ymm8, ymm2, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [r12+1280] + vperm2i128 ymm9, ymm2, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [r12+1312] + vpsllq ymm2, ymm9, 32 + vpsrlq ymm3, ymm8, 32 + vpblendd ymm2, ymm8, ymm2, 170 + vpblendd ymm3, ymm9, ymm3, 85 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm2, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm2, ymm2, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm2, ymm2, ymm14 + vpsubw ymm8, ymm8, ymm0 + vpsubw ymm9, ymm9, ymm2 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 4: 2/2 + vmovdqu ymm10, YMMWORD PTR [r12+1344] + vmovdqu ymm12, YMMWORD PTR [r12+1376] + vmovdqu ymm11, YMMWORD PTR [r12+1408] + vmovdqu ymm13, YMMWORD PTR [r12+1440] + vpunpckldq ymm0, ymm8, ymm1 + vpunpckhdq ymm1, ymm8, ymm1 + vpunpckldq ymm2, ymm9, ymm3 + vpunpckhdq ymm3, ymm9, ymm3 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 8: 2/2 + vmovdqu ymm10, YMMWORD PTR [r12+1472] + vmovdqu ymm12, YMMWORD PTR [r12+1504] + vmovdqu ymm11, YMMWORD PTR [r12+1536] + vmovdqu ymm13, YMMWORD PTR [r12+1568] + vpunpcklqdq ymm0, ymm8, ymm1 + vpunpckhqdq ymm1, ymm8, ymm1 + vpunpcklqdq ymm2, ymm9, ymm3 + vpunpckhqdq ymm3, ymm9, ymm3 + vpaddw ymm8, ymm0, ymm1 + vpaddw ymm9, ymm2, ymm3 + vpsubw ymm1, ymm0, ymm1 + vpsubw ymm3, ymm2, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm2, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm2, ymm2, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm2, ymm2, 
ymm14 + vpsubw ymm8, ymm8, ymm0 + vpsubw ymm9, ymm9, ymm2 + vpmullw ymm0, ymm1, ymm12 + vpmullw ymm2, ymm3, ymm13 + vpmulhw ymm1, ymm1, ymm10 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm0, ymm0, ymm14 + vpmulhw ymm2, ymm2, ymm14 + vpsubw ymm1, ymm1, ymm0 + vpsubw ymm3, ymm3, ymm2 + ; 16: 2/2 + vperm2i128 ymm0, ymm8, ymm1, 32 + vmovdqu ymm10, YMMWORD PTR [r12+1600] + vperm2i128 ymm1, ymm8, ymm1, 49 + vmovdqu ymm12, YMMWORD PTR [r12+1632] + vperm2i128 ymm2, ymm9, ymm3, 32 + vmovdqu ymm11, YMMWORD PTR [r12+1664] + vperm2i128 ymm3, ymm9, ymm3, 49 + vmovdqu ymm13, YMMWORD PTR [r12+1696] + vpsubw ymm8, ymm0, ymm1 + vpsubw ymm9, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm1 + vpaddw ymm2, ymm2, ymm3 + vpmullw ymm1, ymm8, ymm12 + vpmullw ymm3, ymm9, ymm13 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm11 + vpmulhw ymm1, ymm1, ymm14 + vpmulhw ymm3, ymm3, ymm14 + vpsubw ymm1, ymm8, ymm1 + vpsubw ymm3, ymm9, ymm3 + ; 32: 2/2 + vmovdqu ymm10, YMMWORD PTR [r12+1728] + vmovdqu ymm12, YMMWORD PTR [r12+1760] + vpaddw ymm8, ymm0, ymm2 + vpaddw ymm9, ymm1, ymm3 + vpsubw ymm2, ymm0, ymm2 + vpsubw ymm3, ymm1, ymm3 + vpmulhw ymm0, ymm8, ymm15 + vpmulhw ymm1, ymm9, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm8, ymm0 + vpsubw ymm1, ymm9, ymm1 + vpmullw ymm8, ymm2, ymm12 + vpmullw ymm9, ymm3, ymm12 + vpmulhw ymm2, ymm2, ymm10 + vpmulhw ymm3, ymm3, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm2, ymm2, ymm8 + vpsubw ymm3, ymm3, ymm9 + ; 2: 2/2 + vperm2i128 ymm8, ymm4, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [r12+1792] + vperm2i128 ymm9, ymm4, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [r12+1824] + vpsllq ymm4, ymm9, 32 + vpsrlq ymm5, ymm8, 32 + vpblendd ymm4, ymm8, ymm4, 170 + vpblendd ymm5, ymm9, ymm5, 85 + vperm2i128 ymm8, ymm6, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [r12+1856] + vperm2i128 ymm9, ymm6, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [r12+1888] + vpsllq ymm6, ymm9, 32 + vpsrlq ymm7, 
ymm8, 32 + vpblendd ymm6, ymm8, ymm6, 170 + vpblendd ymm7, ymm9, ymm7, 85 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm6, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm6, ymm6, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm6, ymm6, ymm14 + vpsubw ymm8, ymm8, ymm4 + vpsubw ymm9, ymm9, ymm6 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 4: 2/2 + vmovdqu ymm10, YMMWORD PTR [r12+1920] + vmovdqu ymm12, YMMWORD PTR [r12+1952] + vmovdqu ymm11, YMMWORD PTR [r12+1984] + vmovdqu ymm13, YMMWORD PTR [r12+2016] + vpunpckldq ymm4, ymm8, ymm5 + vpunpckhdq ymm5, ymm8, ymm5 + vpunpckldq ymm6, ymm9, ymm7 + vpunpckhdq ymm7, ymm9, ymm7 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 8: 2/2 + vmovdqu ymm10, YMMWORD PTR [r12+2048] + vmovdqu ymm12, YMMWORD PTR [r12+2080] + vmovdqu ymm11, YMMWORD PTR [r12+2112] + vmovdqu ymm13, YMMWORD PTR [r12+2144] + vpunpcklqdq ymm4, ymm8, ymm5 + vpunpckhqdq ymm5, ymm8, ymm5 + vpunpcklqdq ymm6, ymm9, ymm7 + vpunpckhqdq ymm7, ymm9, ymm7 + vpaddw ymm8, ymm4, ymm5 + vpaddw ymm9, ymm6, ymm7 + vpsubw ymm5, ymm4, ymm5 + vpsubw ymm7, ymm6, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm6, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm6, ymm6, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm6, ymm6, ymm14 + vpsubw ymm8, ymm8, ymm4 + vpsubw ymm9, ymm9, ymm6 + vpmullw ymm4, ymm5, ymm12 + vpmullw ymm6, ymm7, ymm13 + vpmulhw ymm5, ymm5, ymm10 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm6, ymm6, 
ymm14 + vpsubw ymm5, ymm5, ymm4 + vpsubw ymm7, ymm7, ymm6 + ; 16: 2/2 + vperm2i128 ymm4, ymm8, ymm5, 32 + vmovdqu ymm10, YMMWORD PTR [r12+2176] + vperm2i128 ymm5, ymm8, ymm5, 49 + vmovdqu ymm12, YMMWORD PTR [r12+2208] + vperm2i128 ymm6, ymm9, ymm7, 32 + vmovdqu ymm11, YMMWORD PTR [r12+2240] + vperm2i128 ymm7, ymm9, ymm7, 49 + vmovdqu ymm13, YMMWORD PTR [r12+2272] + vpsubw ymm8, ymm4, ymm5 + vpsubw ymm9, ymm6, ymm7 + vpaddw ymm4, ymm4, ymm5 + vpaddw ymm6, ymm6, ymm7 + vpmullw ymm5, ymm8, ymm12 + vpmullw ymm7, ymm9, ymm13 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm11 + vpmulhw ymm5, ymm5, ymm14 + vpmulhw ymm7, ymm7, ymm14 + vpsubw ymm5, ymm8, ymm5 + vpsubw ymm7, ymm9, ymm7 + ; 32: 2/2 + vmovdqu ymm10, YMMWORD PTR [r12+2304] + vmovdqu ymm12, YMMWORD PTR [r12+2336] + vpaddw ymm8, ymm4, ymm6 + vpaddw ymm9, ymm5, ymm7 + vpsubw ymm6, ymm4, ymm6 + vpsubw ymm7, ymm5, ymm7 + vpmulhw ymm4, ymm8, ymm15 + vpmulhw ymm5, ymm9, ymm15 + vpsraw ymm4, ymm4, 10 + vpsraw ymm5, ymm5, 10 + vpmullw ymm4, ymm4, ymm14 + vpmullw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + ; 64: 2/2 + vmovdqu ymm10, YMMWORD PTR [r12+2368] + vmovdqu ymm12, YMMWORD PTR [r12+2400] + vpsubw ymm8, ymm0, ymm4 + vpsubw ymm9, ymm1, ymm5 + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + vpmullw ymm4, ymm8, ymm12 + vpmullw ymm5, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpsubw ymm8, ymm2, ymm6 + vpsubw ymm9, ymm3, ymm7 + vpaddw ymm2, ymm2, ymm6 + vpaddw ymm3, ymm3, ymm7 + vpmullw ymm6, ymm8, ymm12 + vpmullw ymm7, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm6, ymm6, ymm14 + vpmulhw ymm7, ymm7, 
ymm14 + vpsubw ymm6, ymm8, ymm6 + vpsubw ymm7, ymm9, ymm7 + vmovdqu YMMWORD PTR [rdx+256], ymm0 + vmovdqu YMMWORD PTR [rdx+288], ymm1 + vmovdqu YMMWORD PTR [rdx+320], ymm2 + vmovdqu YMMWORD PTR [rdx+352], ymm3 + ; 128 + vmovdqu ymm10, YMMWORD PTR [r12+2432] + vmovdqu ymm12, YMMWORD PTR [r12+2464] + vmovdqu ymm11, YMMWORD PTR [r12+2496] + vmovdqu ymm13, YMMWORD PTR [r12+2528] + vmovdqu ymm0, YMMWORD PTR [rdx+128] + vmovdqu ymm1, YMMWORD PTR [rdx+160] + vmovdqu ymm2, YMMWORD PTR [rdx+192] + vmovdqu ymm3, YMMWORD PTR [rdx+224] + vpsubw ymm8, ymm0, ymm4 + vpsubw ymm9, ymm1, ymm5 + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + vpmullw ymm4, ymm8, ymm12 + vpmullw ymm5, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpaddw ymm8, ymm2, ymm6 + vpaddw ymm9, ymm3, ymm7 + vpsubw ymm6, ymm2, ymm6 + vpsubw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm8, ymm15 + vpmulhw ymm3, ymm9, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm8, ymm2 + vpsubw ymm3, ymm9, ymm3 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm0, ymm0, ymm11 + vpmulhw ymm1, ymm1, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm0, ymm8 + vpsubw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm2, ymm13 + vpmullw ymm9, ymm3, ymm13 + vpmulhw ymm2, ymm2, ymm11 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm2, ymm2, ymm8 + vpsubw ymm3, ymm3, ymm9 + vpmullw ymm8, ymm4, ymm13 + vpmullw ymm9, ymm5, ymm13 + vpmulhw ymm4, ymm4, ymm11 + vpmulhw ymm5, ymm5, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + 
vpsubw ymm4, ymm4, ymm8 + vpsubw ymm5, ymm5, ymm9 + vpmullw ymm8, ymm6, ymm13 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm6, ymm6, ymm11 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + vmovdqu YMMWORD PTR [rdx+128], ymm0 + vmovdqu YMMWORD PTR [rdx+160], ymm1 + vmovdqu YMMWORD PTR [rdx+192], ymm2 + vmovdqu YMMWORD PTR [rdx+224], ymm3 + vmovdqu YMMWORD PTR [rdx+384], ymm4 + vmovdqu YMMWORD PTR [rdx+416], ymm5 + vmovdqu YMMWORD PTR [rdx+448], ymm6 + vmovdqu YMMWORD PTR [rdx+480], ymm7 + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm1, YMMWORD PTR [rdx+32] + vmovdqu ymm2, YMMWORD PTR [rdx+64] + vmovdqu ymm3, YMMWORD PTR [rdx+96] + vmovdqu ymm4, YMMWORD PTR [rdx+256] + vmovdqu ymm5, YMMWORD PTR [rdx+288] + vmovdqu ymm6, YMMWORD PTR [rdx+320] + vmovdqu ymm7, YMMWORD PTR [rdx+352] + vpsubw ymm8, ymm0, ymm4 + vpsubw ymm9, ymm1, ymm5 + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm5 + vpmullw ymm4, ymm8, ymm12 + vpmullw ymm5, ymm9, ymm12 + vpmulhw ymm8, ymm8, ymm10 + vpmulhw ymm9, ymm9, ymm10 + vpmulhw ymm4, ymm4, ymm14 + vpmulhw ymm5, ymm5, ymm14 + vpsubw ymm4, ymm8, ymm4 + vpsubw ymm5, ymm9, ymm5 + vpaddw ymm8, ymm2, ymm6 + vpaddw ymm9, ymm3, ymm7 + vpsubw ymm6, ymm2, ymm6 + vpsubw ymm7, ymm3, ymm7 + vpmulhw ymm2, ymm8, ymm15 + vpmulhw ymm3, ymm9, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm8, ymm2 + vpsubw ymm3, ymm9, ymm3 + vpmullw ymm8, ymm6, ymm12 + vpmullw ymm9, ymm7, ymm12 + vpmulhw ymm6, ymm6, ymm10 + vpmulhw ymm7, ymm7, ymm10 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + vpmullw ymm8, ymm0, ymm13 + vpmullw ymm9, ymm1, ymm13 + vpmulhw ymm0, ymm0, ymm11 + vpmulhw ymm1, ymm1, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm0, ymm0, ymm8 + vpsubw ymm1, ymm1, ymm9 + vpmullw ymm8, ymm2, ymm13 + vpmullw ymm9, 
ymm3, ymm13 + vpmulhw ymm2, ymm2, ymm11 + vpmulhw ymm3, ymm3, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm2, ymm2, ymm8 + vpsubw ymm3, ymm3, ymm9 + vpmullw ymm8, ymm4, ymm13 + vpmullw ymm9, ymm5, ymm13 + vpmulhw ymm4, ymm4, ymm11 + vpmulhw ymm5, ymm5, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm4, ymm4, ymm8 + vpsubw ymm5, ymm5, ymm9 + vpmullw ymm8, ymm6, ymm13 + vpmullw ymm9, ymm7, ymm13 + vpmulhw ymm6, ymm6, ymm11 + vpmulhw ymm7, ymm7, ymm11 + vpmulhw ymm8, ymm8, ymm14 + vpmulhw ymm9, ymm9, ymm14 + vpsubw ymm6, ymm6, ymm8 + vpsubw ymm7, ymm7, ymm9 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + vmovdqu YMMWORD PTR [rdx+256], ymm4 + vmovdqu YMMWORD PTR [rdx+288], ymm5 + vmovdqu YMMWORD PTR [rdx+320], ymm6 + vmovdqu YMMWORD PTR [rdx+352], ymm7 + ; Sub Errors + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm1, YMMWORD PTR [rdx+32] + vmovdqu ymm2, YMMWORD PTR [rdx+64] + vmovdqu ymm3, YMMWORD PTR [rdx+96] + vmovdqu ymm4, YMMWORD PTR [r9] + vmovdqu ymm5, YMMWORD PTR [r9+32] + vmovdqu ymm6, YMMWORD PTR [r9+64] + vmovdqu ymm7, YMMWORD PTR [r9+96] + vpsubw ymm4, ymm4, ymm0 + vpsubw ymm5, ymm5, ymm1 + vpmulhw ymm0, ymm4, ymm15 + vpmulhw ymm1, ymm5, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm4, ymm0 + vpsubw ymm1, ymm5, ymm1 + vpsubw ymm6, ymm6, ymm2 + vpsubw ymm7, ymm7, ymm3 + vpmulhw ymm2, ymm6, ymm15 + vpmulhw ymm3, ymm7, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm6, ymm2 + vpsubw ymm3, ymm7, ymm3 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm1 + vmovdqu YMMWORD PTR [rdx+64], ymm2 + vmovdqu YMMWORD PTR [rdx+96], ymm3 + vmovdqu ymm0, YMMWORD PTR [rdx+128] + vmovdqu ymm1, YMMWORD PTR [rdx+160] + vmovdqu ymm2, YMMWORD PTR 
[rdx+192] + vmovdqu ymm3, YMMWORD PTR [rdx+224] + vmovdqu ymm4, YMMWORD PTR [r9+128] + vmovdqu ymm5, YMMWORD PTR [r9+160] + vmovdqu ymm6, YMMWORD PTR [r9+192] + vmovdqu ymm7, YMMWORD PTR [r9+224] + vpsubw ymm4, ymm4, ymm0 + vpsubw ymm5, ymm5, ymm1 + vpmulhw ymm0, ymm4, ymm15 + vpmulhw ymm1, ymm5, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm4, ymm0 + vpsubw ymm1, ymm5, ymm1 + vpsubw ymm6, ymm6, ymm2 + vpsubw ymm7, ymm7, ymm3 + vpmulhw ymm2, ymm6, ymm15 + vpmulhw ymm3, ymm7, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm6, ymm2 + vpsubw ymm3, ymm7, ymm3 + vmovdqu YMMWORD PTR [rdx+128], ymm0 + vmovdqu YMMWORD PTR [rdx+160], ymm1 + vmovdqu YMMWORD PTR [rdx+192], ymm2 + vmovdqu YMMWORD PTR [rdx+224], ymm3 + vmovdqu ymm0, YMMWORD PTR [rdx+256] + vmovdqu ymm1, YMMWORD PTR [rdx+288] + vmovdqu ymm2, YMMWORD PTR [rdx+320] + vmovdqu ymm3, YMMWORD PTR [rdx+352] + vmovdqu ymm4, YMMWORD PTR [r9+256] + vmovdqu ymm5, YMMWORD PTR [r9+288] + vmovdqu ymm6, YMMWORD PTR [r9+320] + vmovdqu ymm7, YMMWORD PTR [r9+352] + vpsubw ymm4, ymm4, ymm0 + vpsubw ymm5, ymm5, ymm1 + vpmulhw ymm0, ymm4, ymm15 + vpmulhw ymm1, ymm5, ymm15 + vpsraw ymm0, ymm0, 10 + vpsraw ymm1, ymm1, 10 + vpmullw ymm0, ymm0, ymm14 + vpmullw ymm1, ymm1, ymm14 + vpsubw ymm0, ymm4, ymm0 + vpsubw ymm1, ymm5, ymm1 + vpsubw ymm6, ymm6, ymm2 + vpsubw ymm7, ymm7, ymm3 + vpmulhw ymm2, ymm6, ymm15 + vpmulhw ymm3, ymm7, ymm15 + vpsraw ymm2, ymm2, 10 + vpsraw ymm3, ymm3, 10 + vpmullw ymm2, ymm2, ymm14 + vpmullw ymm3, ymm3, ymm14 + vpsubw ymm2, ymm6, ymm2 + vpsubw ymm3, ymm7, ymm3 + vmovdqu YMMWORD PTR [rdx+256], ymm0 + vmovdqu YMMWORD PTR [rdx+288], ymm1 + vmovdqu YMMWORD PTR [rdx+320], ymm2 + vmovdqu YMMWORD PTR [rdx+352], ymm3 + vmovdqu ymm0, YMMWORD PTR [rdx+384] + vmovdqu ymm1, YMMWORD PTR [rdx+416] + vmovdqu ymm2, YMMWORD PTR [rdx+448] + vmovdqu ymm3, YMMWORD PTR 
[rdx+480]
+    vmovdqu ymm4, YMMWORD PTR [r9+384]
+    vmovdqu ymm5, YMMWORD PTR [r9+416]
+    vmovdqu ymm6, YMMWORD PTR [r9+448]
+    vmovdqu ymm7, YMMWORD PTR [r9+480]
+    vpsubw ymm4, ymm4, ymm0
+    vpsubw ymm5, ymm5, ymm1
+    vpmulhw ymm0, ymm4, ymm15
+    vpmulhw ymm1, ymm5, ymm15
+    vpsraw ymm0, ymm0, 10
+    vpsraw ymm1, ymm1, 10
+    vpmullw ymm0, ymm0, ymm14
+    vpmullw ymm1, ymm1, ymm14
+    vpsubw ymm0, ymm4, ymm0
+    vpsubw ymm1, ymm5, ymm1
+    vpsubw ymm6, ymm6, ymm2
+    vpsubw ymm7, ymm7, ymm3
+    vpmulhw ymm2, ymm6, ymm15
+    vpmulhw ymm3, ymm7, ymm15
+    vpsraw ymm2, ymm2, 10
+    vpsraw ymm3, ymm3, 10
+    vpmullw ymm2, ymm2, ymm14
+    vpmullw ymm3, ymm3, ymm14
+    vpsubw ymm2, ymm6, ymm2
+    vpsubw ymm3, ymm7, ymm3
+    vmovdqu YMMWORD PTR [rdx+384], ymm0
+    vmovdqu YMMWORD PTR [rdx+416], ymm1
+    vmovdqu YMMWORD PTR [rdx+448], ymm2
+    vmovdqu YMMWORD PTR [rdx+480], ymm3
+    vzeroupper
+    vmovdqu xmm6, OWORD PTR [rsp]
+    vmovdqu xmm7, OWORD PTR [rsp+16]
+    vmovdqu xmm8, OWORD PTR [rsp+32]
+    vmovdqu xmm9, OWORD PTR [rsp+48]
+    vmovdqu xmm10, OWORD PTR [rsp+64]
+    vmovdqu xmm11, OWORD PTR [rsp+80]
+    vmovdqu xmm12, OWORD PTR [rsp+96]
+    vmovdqu xmm13, OWORD PTR [rsp+112]
+    vmovdqu xmm14, OWORD PTR [rsp+128]
+    vmovdqu xmm15, OWORD PTR [rsp+144]
+    add rsp, 160
+    pop r12
+    ret
+mlkem_decapsulate_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA  ; ---- mlkem_csubq_avx2: conditional subtraction of q over 256 16-bit words ----
+mlkem_csubq_avx2 PROC  ; In: rcx -> 512 bytes of 16-bit words, rewritten in place. No return value is set.
+    sub rsp, 112  ; 7 * 16 bytes of scratch: xmm6..xmm12 are saved here and restored before ret
+    vmovdqu OWORD PTR [rsp], xmm6
+    vmovdqu OWORD PTR [rsp+16], xmm7
+    vmovdqu OWORD PTR [rsp+32], xmm8
+    vmovdqu OWORD PTR [rsp+48], xmm9
+    vmovdqu OWORD PTR [rsp+64], xmm10
+    vmovdqu OWORD PTR [rsp+80], xmm11
+    vmovdqu OWORD PTR [rsp+96], xmm12
+    vmovdqu ymm12, YMMWORD PTR mlkem_q  ; ymm12 = q vector; mlkem_q is defined outside this chunk (presumably 16 x 3329 - confirm)
+    vmovdqu ymm0, YMMWORD PTR [rcx]  ; first half: bytes 0..255 into ymm0..ymm7
+    vmovdqu ymm1, YMMWORD PTR [rcx+32]
+    vmovdqu ymm2, YMMWORD PTR [rcx+64]
+    vmovdqu ymm3, YMMWORD PTR [rcx+96]
+    vmovdqu ymm4, YMMWORD PTR [rcx+128]
+    vmovdqu ymm5, YMMWORD PTR [rcx+160]
+    vmovdqu ymm6, YMMWORD PTR [rcx+192]
+    vmovdqu ymm7, YMMWORD PTR [rcx+224]
+    vpsubw ymm8, ymm0, ymm12  ; t = x - q (ymm0..ymm3 handled four registers at a time)
+    vpsubw ymm9, ymm1, ymm12
+    vpsubw ymm10, ymm2, ymm12
+    vpsubw ymm11, ymm3, ymm12
+    vpsraw ymm0, ymm8, 15  ; arithmetic shift: all ones in every word where t is negative, else zero
+    vpsraw ymm1, ymm9, 15
+    vpsraw ymm2, ymm10, 15
+    vpsraw ymm3, ymm11, 15
+    vpand ymm0, ymm0, ymm12  ; q where t was negative, 0 elsewhere
+    vpand ymm1, ymm1, ymm12
+    vpand ymm2, ymm2, ymm12
+    vpand ymm3, ymm3, ymm12
+    vpaddw ymm0, ymm0, ymm8  ; result = t + (q or 0): x unchanged when x - q < 0, otherwise x - q
+    vpaddw ymm1, ymm1, ymm9
+    vpaddw ymm2, ymm2, ymm10
+    vpaddw ymm3, ymm3, ymm11
+    vpsubw ymm8, ymm4, ymm12  ; same four steps for ymm4..ymm7 (ymm8..ymm11 reused as temporaries)
+    vpsubw ymm9, ymm5, ymm12
+    vpsubw ymm10, ymm6, ymm12
+    vpsubw ymm11, ymm7, ymm12
+    vpsraw ymm4, ymm8, 15
+    vpsraw ymm5, ymm9, 15
+    vpsraw ymm6, ymm10, 15
+    vpsraw ymm7, ymm11, 15
+    vpand ymm4, ymm4, ymm12
+    vpand ymm5, ymm5, ymm12
+    vpand ymm6, ymm6, ymm12
+    vpand ymm7, ymm7, ymm12
+    vpaddw ymm4, ymm4, ymm8
+    vpaddw ymm5, ymm5, ymm9
+    vpaddw ymm6, ymm6, ymm10
+    vpaddw ymm7, ymm7, ymm11
+    vmovdqu YMMWORD PTR [rcx], ymm0  ; write the first half back in place
+    vmovdqu YMMWORD PTR [rcx+32], ymm1
+    vmovdqu YMMWORD PTR [rcx+64], ymm2
+    vmovdqu YMMWORD PTR [rcx+96], ymm3
+    vmovdqu YMMWORD PTR [rcx+128], ymm4
+    vmovdqu YMMWORD PTR [rcx+160], ymm5
+    vmovdqu YMMWORD PTR [rcx+192], ymm6
+    vmovdqu YMMWORD PTR [rcx+224], ymm7
+    vmovdqu ymm0, YMMWORD PTR [rcx+256]  ; second half: bytes 256..511, identical processing
+    vmovdqu ymm1, YMMWORD PTR [rcx+288]
+    vmovdqu ymm2, YMMWORD PTR [rcx+320]
+    vmovdqu ymm3, YMMWORD PTR [rcx+352]
+    vmovdqu ymm4, YMMWORD PTR [rcx+384]
+    vmovdqu ymm5, YMMWORD PTR [rcx+416]
+    vmovdqu ymm6, YMMWORD PTR [rcx+448]
+    vmovdqu ymm7, YMMWORD PTR [rcx+480]
+    vpsubw ymm8, ymm0, ymm12
+    vpsubw ymm9, ymm1, ymm12
+    vpsubw ymm10, ymm2, ymm12
+    vpsubw ymm11, ymm3, ymm12
+    vpsraw ymm0, ymm8, 15
+    vpsraw ymm1, ymm9, 15
+    vpsraw ymm2, ymm10, 15
+    vpsraw ymm3, ymm11, 15
+    vpand ymm0, ymm0, ymm12
+    vpand ymm1, ymm1, ymm12
+    vpand ymm2, ymm2, ymm12
+    vpand ymm3, ymm3, ymm12
+    vpaddw ymm0, ymm0, ymm8
+    vpaddw ymm1, ymm1, ymm9
+    vpaddw ymm2, ymm2, ymm10
+    vpaddw ymm3, ymm3, ymm11
+    vpsubw ymm8, ymm4, ymm12
+    vpsubw ymm9, ymm5, ymm12
+    vpsubw ymm10, ymm6, ymm12
+    vpsubw ymm11, ymm7, ymm12
+    vpsraw ymm4, ymm8, 15
+    vpsraw ymm5, ymm9, 15
+    vpsraw ymm6, ymm10, 15
+    vpsraw ymm7, ymm11, 15
+    vpand ymm4, ymm4, ymm12
+    vpand ymm5, ymm5, ymm12
+    vpand ymm6, ymm6, ymm12
+    vpand ymm7, ymm7, ymm12
+    vpaddw ymm4, ymm4, ymm8
+    vpaddw ymm5, ymm5, ymm9
+    vpaddw ymm6, ymm6, ymm10
+    vpaddw ymm7, ymm7, ymm11
+    vmovdqu YMMWORD PTR [rcx+256], ymm0
+    vmovdqu YMMWORD PTR [rcx+288], ymm1
+    vmovdqu YMMWORD PTR [rcx+320], ymm2
+    vmovdqu YMMWORD PTR [rcx+352], ymm3
+    vmovdqu YMMWORD PTR [rcx+384], ymm4
+    vmovdqu YMMWORD PTR [rcx+416], ymm5
+    vmovdqu YMMWORD PTR [rcx+448], ymm6
+    vmovdqu YMMWORD PTR [rcx+480], ymm7
+    vzeroupper  ; clear upper YMM halves before returning to the caller
+    vmovdqu xmm6, OWORD PTR [rsp]  ; restore the xmm registers saved in the prologue
+    vmovdqu xmm7, OWORD PTR [rsp+16]
+    vmovdqu xmm8, OWORD PTR [rsp+32]
+    vmovdqu xmm9, OWORD PTR [rsp+48]
+    vmovdqu xmm10, OWORD PTR [rsp+64]
+    vmovdqu xmm11, OWORD PTR [rsp+80]
+    vmovdqu xmm12, OWORD PTR [rsp+96]
+    add rsp, 112
+    ret
+mlkem_csubq_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_rej_idx QWORD 0ffffffffffffffffh, 0ffffffffffffff00h  ; one QWORD per mask value: for each set bit k the entry holds byte 2*k, lowest first; unused bytes are 0ffh
+    QWORD 0ffffffffffffff02h, 0ffffffffffff0200h  ; e.g. mask 2 -> 02h; mask 3 -> 00h,02h
+    QWORD 0ffffffffffffff04h, 0ffffffffffff0400h
+    QWORD 0ffffffffffff0402h, 0ffffffffff040200h
+    QWORD 0ffffffffffffff06h, 0ffffffffffff0600h
+    QWORD 0ffffffffffff0602h, 0ffffffffff060200h
+    QWORD 0ffffffffffff0604h, 0ffffffffff060400h
+    QWORD 0ffffffffff060402h, 0ffffffff06040200h
+    QWORD 0ffffffffffffff08h, 0ffffffffffff0800h
+    QWORD 0ffffffffffff0802h, 0ffffffffff080200h
+    QWORD 0ffffffffffff0804h, 0ffffffffff080400h
+    QWORD 0ffffffffff080402h, 0ffffffff08040200h
+    QWORD 0ffffffffffff0806h, 0ffffffffff080600h
+    QWORD 0ffffffffff080602h, 0ffffffff08060200h
+    QWORD 0ffffffffff080604h, 0ffffffff08060400h
+    QWORD 0ffffffff08060402h, 0ffffff0806040200h
+    QWORD 0ffffffffffffff0ah, 0ffffffffffff0a00h
+    QWORD 0ffffffffffff0a02h, 0ffffffffff0a0200h
+    QWORD 0ffffffffffff0a04h, 0ffffffffff0a0400h
+    QWORD 0ffffffffff0a0402h, 0ffffffff0a040200h
+    QWORD 0ffffffffffff0a06h, 0ffffffffff0a0600h
+    QWORD 0ffffffffff0a0602h, 0ffffffff0a060200h
+    QWORD 0ffffffffff0a0604h, 0ffffffff0a060400h
+    QWORD 0ffffffff0a060402h, 0ffffff0a06040200h +
QWORD 0ffffffffffff0a08h, 0ffffffffff0a0800h + QWORD 0ffffffffff0a0802h, 0ffffffff0a080200h + QWORD 0ffffffffff0a0804h, 0ffffffff0a080400h + QWORD 0ffffffff0a080402h, 0ffffff0a08040200h + QWORD 0ffffffffff0a0806h, 0ffffffff0a080600h + QWORD 0ffffffff0a080602h, 0ffffff0a08060200h + QWORD 0ffffffff0a080604h, 0ffffff0a08060400h + QWORD 0ffffff0a08060402h, 0ffff0a0806040200h + QWORD 0ffffffffffffff0ch, 0ffffffffffff0c00h + QWORD 0ffffffffffff0c02h, 0ffffffffff0c0200h + QWORD 0ffffffffffff0c04h, 0ffffffffff0c0400h + QWORD 0ffffffffff0c0402h, 0ffffffff0c040200h + QWORD 0ffffffffffff0c06h, 0ffffffffff0c0600h + QWORD 0ffffffffff0c0602h, 0ffffffff0c060200h + QWORD 0ffffffffff0c0604h, 0ffffffff0c060400h + QWORD 0ffffffff0c060402h, 0ffffff0c06040200h + QWORD 0ffffffffffff0c08h, 0ffffffffff0c0800h + QWORD 0ffffffffff0c0802h, 0ffffffff0c080200h + QWORD 0ffffffffff0c0804h, 0ffffffff0c080400h + QWORD 0ffffffff0c080402h, 0ffffff0c08040200h + QWORD 0ffffffffff0c0806h, 0ffffffff0c080600h + QWORD 0ffffffff0c080602h, 0ffffff0c08060200h + QWORD 0ffffffff0c080604h, 0ffffff0c08060400h + QWORD 0ffffff0c08060402h, 0ffff0c0806040200h + QWORD 0ffffffffffff0c0ah, 0ffffffffff0c0a00h + QWORD 0ffffffffff0c0a02h, 0ffffffff0c0a0200h + QWORD 0ffffffffff0c0a04h, 0ffffffff0c0a0400h + QWORD 0ffffffff0c0a0402h, 0ffffff0c0a040200h + QWORD 0ffffffffff0c0a06h, 0ffffffff0c0a0600h + QWORD 0ffffffff0c0a0602h, 0ffffff0c0a060200h + QWORD 0ffffffff0c0a0604h, 0ffffff0c0a060400h + QWORD 0ffffff0c0a060402h, 0ffff0c0a06040200h + QWORD 0ffffffffff0c0a08h, 0ffffffff0c0a0800h + QWORD 0ffffffff0c0a0802h, 0ffffff0c0a080200h + QWORD 0ffffffff0c0a0804h, 0ffffff0c0a080400h + QWORD 0ffffff0c0a080402h, 0ffff0c0a08040200h + QWORD 0ffffffff0c0a0806h, 0ffffff0c0a080600h + QWORD 0ffffff0c0a080602h, 0ffff0c0a08060200h + QWORD 0ffffff0c0a080604h, 0ffff0c0a08060400h + QWORD 0ffff0c0a08060402h, 0ff0c0a0806040200h + QWORD 0ffffffffffffff0eh, 0ffffffffffff0e00h + QWORD 0ffffffffffff0e02h, 0ffffffffff0e0200h + QWORD 0ffffffffffff0e04h, 
0ffffffffff0e0400h + QWORD 0ffffffffff0e0402h, 0ffffffff0e040200h + QWORD 0ffffffffffff0e06h, 0ffffffffff0e0600h + QWORD 0ffffffffff0e0602h, 0ffffffff0e060200h + QWORD 0ffffffffff0e0604h, 0ffffffff0e060400h + QWORD 0ffffffff0e060402h, 0ffffff0e06040200h + QWORD 0ffffffffffff0e08h, 0ffffffffff0e0800h + QWORD 0ffffffffff0e0802h, 0ffffffff0e080200h + QWORD 0ffffffffff0e0804h, 0ffffffff0e080400h + QWORD 0ffffffff0e080402h, 0ffffff0e08040200h + QWORD 0ffffffffff0e0806h, 0ffffffff0e080600h + QWORD 0ffffffff0e080602h, 0ffffff0e08060200h + QWORD 0ffffffff0e080604h, 0ffffff0e08060400h + QWORD 0ffffff0e08060402h, 0ffff0e0806040200h + QWORD 0ffffffffffff0e0ah, 0ffffffffff0e0a00h + QWORD 0ffffffffff0e0a02h, 0ffffffff0e0a0200h + QWORD 0ffffffffff0e0a04h, 0ffffffff0e0a0400h + QWORD 0ffffffff0e0a0402h, 0ffffff0e0a040200h + QWORD 0ffffffffff0e0a06h, 0ffffffff0e0a0600h + QWORD 0ffffffff0e0a0602h, 0ffffff0e0a060200h + QWORD 0ffffffff0e0a0604h, 0ffffff0e0a060400h + QWORD 0ffffff0e0a060402h, 0ffff0e0a06040200h + QWORD 0ffffffffff0e0a08h, 0ffffffff0e0a0800h + QWORD 0ffffffff0e0a0802h, 0ffffff0e0a080200h + QWORD 0ffffffff0e0a0804h, 0ffffff0e0a080400h + QWORD 0ffffff0e0a080402h, 0ffff0e0a08040200h + QWORD 0ffffffff0e0a0806h, 0ffffff0e0a080600h + QWORD 0ffffff0e0a080602h, 0ffff0e0a08060200h + QWORD 0ffffff0e0a080604h, 0ffff0e0a08060400h + QWORD 0ffff0e0a08060402h, 0ff0e0a0806040200h + QWORD 0ffffffffffff0e0ch, 0ffffffffff0e0c00h + QWORD 0ffffffffff0e0c02h, 0ffffffff0e0c0200h + QWORD 0ffffffffff0e0c04h, 0ffffffff0e0c0400h + QWORD 0ffffffff0e0c0402h, 0ffffff0e0c040200h + QWORD 0ffffffffff0e0c06h, 0ffffffff0e0c0600h + QWORD 0ffffffff0e0c0602h, 0ffffff0e0c060200h + QWORD 0ffffffff0e0c0604h, 0ffffff0e0c060400h + QWORD 0ffffff0e0c060402h, 0ffff0e0c06040200h + QWORD 0ffffffffff0e0c08h, 0ffffffff0e0c0800h + QWORD 0ffffffff0e0c0802h, 0ffffff0e0c080200h + QWORD 0ffffffff0e0c0804h, 0ffffff0e0c080400h + QWORD 0ffffff0e0c080402h, 0ffff0e0c08040200h + QWORD 0ffffffff0e0c0806h, 0ffffff0e0c080600h + 
QWORD 0ffffff0e0c080602h, 0ffff0e0c08060200h
+    QWORD 0ffffff0e0c080604h, 0ffff0e0c08060400h
+    QWORD 0ffff0e0c08060402h, 0ff0e0c0806040200h
+    QWORD 0ffffffffff0e0c0ah, 0ffffffff0e0c0a00h
+    QWORD 0ffffffff0e0c0a02h, 0ffffff0e0c0a0200h
+    QWORD 0ffffffff0e0c0a04h, 0ffffff0e0c0a0400h
+    QWORD 0ffffff0e0c0a0402h, 0ffff0e0c0a040200h
+    QWORD 0ffffffff0e0c0a06h, 0ffffff0e0c0a0600h
+    QWORD 0ffffff0e0c0a0602h, 0ffff0e0c0a060200h
+    QWORD 0ffffff0e0c0a0604h, 0ffff0e0c0a060400h
+    QWORD 0ffff0e0c0a060402h, 0ff0e0c0a06040200h
+    QWORD 0ffffffff0e0c0a08h, 0ffffff0e0c0a0800h
+    QWORD 0ffffff0e0c0a0802h, 0ffff0e0c0a080200h
+    QWORD 0ffffff0e0c0a0804h, 0ffff0e0c0a080400h
+    QWORD 0ffff0e0c0a080402h, 0ff0e0c0a08040200h
+    QWORD 0ffffff0e0c0a0806h, 0ffff0e0c0a080600h
+    QWORD 0ffff0e0c0a080602h, 0ff0e0c0a08060200h
+    QWORD 0ffff0e0c0a080604h, 0ff0e0c0a08060400h
+    QWORD 0ff0e0c0a08060402h, 0e0c0a0806040200h  ; final entry of L_mlkem_rej_idx: all eight byte offsets 0,2,..,14 present
+ptr_L_mlkem_rej_idx QWORD L_mlkem_rej_idx  ; address of the index table; the rej_uniform routines load it into r11
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_rej_q QWORD 0d010d010d010d01h, 0d010d010d010d01h  ; 16 words of 0d01h (3329): candidates must be strictly below this
+    QWORD 0d010d010d010d01h, 0d010d010d010d01h
+ptr_L_mlkem_rej_q QWORD L_mlkem_rej_q
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_rej_ones QWORD 0101010101010101h, 0101010101010101h  ; 32 bytes of 01h: added to table bytes to form the odd (high-byte) shuffle index
+    QWORD 0101010101010101h, 0101010101010101h
+ptr_L_mlkem_rej_ones QWORD L_mlkem_rej_ones
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_rej_mask QWORD 0fff0fff0fff0fffh, 0fff0fff0fff0fffh  ; 16 words of 0fffh: keeps the low 12 bits of each word
+    QWORD 0fff0fff0fff0fffh, 0fff0fff0fff0fffh
+ptr_L_mlkem_rej_mask QWORD L_mlkem_rej_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_rej_shuffle QWORD 0504040302010100h, 0b0a0a0908070706h  ; low lane: bytes 0,1,1,2,3,4,4,5,... - each 3 bytes b0 b1 b2 become words (b0,b1),(b1,b2)
+    QWORD 0908080706050504h, 0f0e0e0d0c0b0b0ah  ; high lane: same pattern starting at lane byte 4
+ptr_L_mlkem_rej_shuffle QWORD L_mlkem_rej_shuffle
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA  ; ---- mlkem_rej_uniform_n_avx2: keep 12-bit candidates below 3329 ----
+mlkem_rej_uniform_n_avx2 PROC  ; In: rcx = output (16-bit words), edx = words wanted, r8 = input bytes, r9d = input length. Out: eax = words written.
+    push rbx  ; rbx, r12-r15, rdi, rsi, rbp are saved here and popped in reverse order before ret
+    push r12
+    push r13
+    push r14
+    push r15
+    push rdi
+    push rsi
+    push rbp
+    mov r10, rcx  ; r10 = running output pointer (rcx is reused as scratch below)
+    sub rsp, 64  ; scratch for xmm6..xmm9, restored before ret
+    vmovdqu OWORD PTR [rsp], xmm6
+    vmovdqu OWORD PTR [rsp+16], xmm7
+    vmovdqu OWORD PTR [rsp+32], xmm8
+    vmovdqu OWORD PTR [rsp+48], xmm9
+    mov eax, edx  ; remember the requested count; the result is eax - remaining edx
+    vmovdqu ymm6, YMMWORD PTR L_mlkem_rej_q  ; ymm6 = 3329 in every word
+    vmovdqu ymm7, YMMWORD PTR L_mlkem_rej_ones  ; ymm7 = 1 in every byte
+    vmovdqu ymm8, YMMWORD PTR L_mlkem_rej_mask  ; ymm8 = 0fffh in every word
+    vmovdqu ymm9, YMMWORD PTR L_mlkem_rej_shuffle  ; ymm9 = 3-bytes-to-2-words shuffle control
+    mov r11, QWORD PTR [ptr_L_mlkem_rej_idx]  ; r11 = base of the mask -> byte-offset table
+    mov rdi, 1229782938247303441  ; rdi, rbp and r15 are loaded here but not referenced again in this routine
+    mov rbp, 1012195045828461056
+    mov r15, 72340172838076673
+    vpermq ymm0, [r8], 148  ; group 1 of 7 unrolled groups. NOTE(review): no edx/r9d checks until after group 7 - presumably callers guarantee >= 336 input bytes and enough output room; confirm
+    vpermq ymm1, [r8+24], 148  ; 148 selects qwords 0,1,1,2: low lane = bytes 0..15, high lane = bytes 8..23 of each 24-byte half
+    vpshufb ymm0, ymm0, ymm9  ; expand every 3 bytes into two 16-bit words
+    vpshufb ymm1, ymm1, ymm9
+    vpsrlw ymm2, ymm0, 4  ; copy shifted right by 4, needed for the odd words
+    vpsrlw ymm3, ymm1, 4
+    vpblendw ymm0, ymm0, ymm2, 170  ; 170 = 0aah: odd words take the shifted copy, even words stay
+    vpblendw ymm1, ymm1, ymm3, 170
+    vpand ymm0, ymm0, ymm8  ; keep 12 bits: 16 candidates in ymm0, 16 in ymm1
+    vpand ymm1, ymm1, ymm8
+    vpcmpgtw ymm2, ymm6, ymm0  ; all ones where 3329 > candidate
+    vpcmpgtw ymm3, ymm6, ymm1
+    vpacksswb ymm2, ymm2, ymm3  ; one byte per candidate; per lane: 8 from ymm0 then 8 from ymm1
+    vpmovmskb rbx, ymm2  ; accept mask: bits 0-7 ymm0 low, 8-15 ymm1 low, 16-23 ymm0 high, 24-31 ymm1 high
+    movzx r12d, bl  ; table index for ymm0 low lane
+    movzx ecx, bh  ; table index for ymm1 low lane
+    mov r13, rbx
+    mov r14, rbx
+    shr r13, 16
+    shr r14, 24
+    and r13, 255  ; table index for ymm0 high lane
+    and r14, 255  ; table index for ymm1 high lane
+    movq xmm2, QWORD PTR [r11+8*r12]  ; 8 offset bytes per group of 8 candidates
+    movq xmm3, QWORD PTR [r11+8*rcx]
+    movq xmm4, QWORD PTR [r11+8*r13]
+    movq xmm5, QWORD PTR [r11+8*r14]
+    vinserti128 ymm2, ymm2, xmm4, 1  ; high-lane offsets go to the upper 128 bits
+    vinserti128 ymm3, ymm3, xmm5, 1
+    vpaddb ymm4, ymm2, ymm7  ; offset + 1 addresses the high byte of each kept word
+    vpaddb ymm5, ymm3, ymm7
+    vpunpcklbw ymm2, ymm2, ymm4  ; interleave into (2k, 2k+1) pairs = word-granular shuffle control
+    vpunpcklbw ymm3, ymm3, ymm5
+    vpshufb ymm0, ymm0, ymm2  ; accepted words move to the front of each lane
+    vpshufb ymm1, ymm1, ymm3
+    mov r12, rbx  ; next: count accepted words in each group of 8
+    mov r13, rbx
+    mov r14, rbx
+    and rbx, 255
+    shr r12, 16
+    shr r13, 8
+    shr r14, 24
+    and r12, 255
+    and r13, 255
+    popcnt ebx, ebx  ; count for ymm0 low lane
+    popcnt r12d, r12d  ; count for ymm0 high lane
+    popcnt r13d, r13d  ; count for ymm1 low lane
+    popcnt r14d, r14d  ; count for ymm1 high lane
+    vmovdqu OWORD PTR [r10], xmm0  ; each store writes a full 16 bytes; r10 then advances by only 2*count, so the next store overwrites the surplus
+    vextracti128 xmm0, ymm0, 1
+    lea r10, QWORD PTR [r10+2*rbx]
+    sub edx, ebx  ; edx = words still wanted
+    vmovdqu OWORD PTR [r10], xmm0
+    lea r10, QWORD PTR [r10+2*r12]
+    sub edx, r12d
+    vmovdqu OWORD PTR [r10], xmm1
+    vextracti128 xmm1, ymm1, 1
+    lea r10, QWORD PTR [r10+2*r13]
+    sub edx, r13d
+    vmovdqu OWORD PTR [r10], xmm1
+    lea r10, QWORD PTR [r10+2*r14]
+    sub edx, r14d
+    vpermq ymm0, [r8+48], 148  ; group 2: input bytes 48..95, same steps as group 1
+    vpermq ymm1, [r8+72], 148
+    vpshufb ymm0, ymm0, ymm9
+    vpshufb ymm1, ymm1, ymm9
+    vpsrlw ymm2, ymm0, 4
+    vpsrlw ymm3, ymm1, 4
+    vpblendw ymm0, ymm0, ymm2, 170
+    vpblendw ymm1, ymm1, ymm3, 170
+    vpand ymm0, ymm0, ymm8
+    vpand ymm1, ymm1, ymm8
+    vpcmpgtw ymm2, ymm6, ymm0
+    vpcmpgtw ymm3, ymm6, ymm1
+    vpacksswb ymm2, ymm2, ymm3
+    vpmovmskb rbx, ymm2
+    movzx r12d, bl
+    movzx ecx, bh
+    mov r13, rbx
+    mov r14, rbx
+    shr r13, 16
+    shr r14, 24
+    and r13, 255
+    and r14, 255
+    movq xmm2, QWORD PTR [r11+8*r12]
+    movq xmm3, QWORD PTR [r11+8*rcx]
+    movq xmm4, QWORD PTR [r11+8*r13]
+    movq xmm5, QWORD PTR [r11+8*r14]
+    vinserti128 ymm2, ymm2, xmm4, 1
+    vinserti128 ymm3, ymm3, xmm5, 1
+    vpaddb ymm4, ymm2, ymm7
+    vpaddb ymm5, ymm3, ymm7
+    vpunpcklbw ymm2, ymm2, ymm4
+    vpunpcklbw ymm3, ymm3, ymm5
+    vpshufb ymm0, ymm0, ymm2
+    vpshufb ymm1, ymm1, ymm3
+    mov r12, rbx
+    mov r13, rbx
+    mov r14, rbx
+    and rbx, 255
+    shr r12, 16
+    shr r13, 8
+    shr r14, 24
+    and r12, 255
+    and r13, 255
+    popcnt ebx, ebx
+    popcnt r12d, r12d
+    popcnt r13d, r13d
+    popcnt r14d, r14d
+    vmovdqu OWORD PTR [r10], xmm0
+    vextracti128 xmm0, ymm0, 1
+    lea r10, QWORD PTR [r10+2*rbx]
+    sub edx, ebx
+    vmovdqu OWORD PTR [r10], xmm0
+    lea r10, QWORD PTR [r10+2*r12]
+    sub edx, r12d
+    vmovdqu OWORD PTR [r10], xmm1
+    vextracti128 xmm1, ymm1, 1
+    lea r10, QWORD PTR [r10+2*r13]
+    sub edx, r13d
+    vmovdqu OWORD PTR [r10], xmm1
+    lea r10, QWORD PTR [r10+2*r14]
+    sub edx, r14d
+    vpermq ymm0, [r8+96], 148  ; group 3: input bytes 96..143
+    vpermq ymm1, [r8+120], 148
+    vpshufb ymm0, ymm0, ymm9
+    vpshufb ymm1, ymm1, ymm9
+    vpsrlw ymm2, ymm0, 4
+    vpsrlw ymm3, ymm1, 4
+    vpblendw ymm0, ymm0, ymm2, 170
+    vpblendw ymm1, ymm1, ymm3, 170
+    vpand ymm0, ymm0, ymm8
+    vpand ymm1, ymm1, ymm8
+    vpcmpgtw ymm2, ymm6, ymm0
+    vpcmpgtw ymm3, ymm6, ymm1
+    vpacksswb ymm2, ymm2, ymm3
+    vpmovmskb rbx, ymm2
+    movzx r12d, bl
+    movzx ecx, bh
+    mov r13, rbx
+    mov r14, rbx
+    shr r13, 16
+    shr r14, 24
+    and r13, 255
+    and r14, 255
+    movq xmm2, QWORD PTR [r11+8*r12]
+    movq xmm3, QWORD PTR [r11+8*rcx]
+    movq xmm4, QWORD PTR [r11+8*r13]
+    movq xmm5, QWORD PTR [r11+8*r14]
+    vinserti128 ymm2, ymm2, xmm4, 1
+    vinserti128 ymm3, ymm3, xmm5, 1
+    vpaddb ymm4, ymm2, ymm7
+    vpaddb ymm5, ymm3, ymm7
+    vpunpcklbw ymm2, ymm2, ymm4
+    vpunpcklbw ymm3, ymm3, ymm5
+    vpshufb ymm0, ymm0, ymm2
+    vpshufb ymm1, ymm1, ymm3
+    mov r12, rbx
+    mov r13, rbx
+    mov r14, rbx
+    and rbx, 255
+    shr r12, 16
+    shr r13, 8
+    shr r14, 24
+    and r12, 255
+    and r13, 255
+    popcnt ebx, ebx
+    popcnt r12d, r12d
+    popcnt r13d, r13d
+    popcnt r14d, r14d
+    vmovdqu OWORD PTR [r10], xmm0
+    vextracti128 xmm0, ymm0, 1
+    lea r10, QWORD PTR [r10+2*rbx]
+    sub edx, ebx
+    vmovdqu OWORD PTR [r10], xmm0
+    lea r10, QWORD PTR [r10+2*r12]
+    sub edx, r12d
+    vmovdqu OWORD PTR [r10], xmm1
+    vextracti128 xmm1, ymm1, 1
+    lea r10, QWORD PTR [r10+2*r13]
+    sub edx, r13d
+    vmovdqu OWORD PTR [r10], xmm1
+    lea r10, QWORD PTR [r10+2*r14]
+    sub edx, r14d
+    vpermq ymm0, [r8+144], 148  ; group 4: input bytes 144..191
+    vpermq ymm1, [r8+168], 148
+    vpshufb ymm0, ymm0, ymm9
+    vpshufb ymm1, ymm1, ymm9
+    vpsrlw ymm2, ymm0, 4
+    vpsrlw ymm3, ymm1, 4
+    vpblendw ymm0, ymm0, ymm2, 170
+    vpblendw ymm1, ymm1, ymm3, 170
+    vpand ymm0, ymm0, ymm8
+    vpand ymm1, ymm1, ymm8
+    vpcmpgtw ymm2, ymm6, ymm0
+    vpcmpgtw ymm3, ymm6, ymm1
+    vpacksswb ymm2, ymm2, ymm3
+    vpmovmskb rbx, ymm2
+    movzx r12d, bl
+    movzx ecx, bh
+    mov r13, rbx
+    mov r14, rbx
+    shr r13, 16
+    shr r14, 24
+    and r13, 255
+    and r14, 255
+    movq xmm2, QWORD PTR [r11+8*r12]
+    movq xmm3, QWORD PTR [r11+8*rcx]
+    movq xmm4, QWORD PTR [r11+8*r13]
+    movq xmm5, QWORD PTR [r11+8*r14]
+    vinserti128 ymm2, ymm2, xmm4, 1
+    vinserti128 ymm3, ymm3, xmm5, 1
+    vpaddb ymm4, ymm2, ymm7
+    vpaddb ymm5, ymm3, ymm7
+    vpunpcklbw ymm2, ymm2, ymm4
+    vpunpcklbw ymm3, ymm3, ymm5
+    vpshufb ymm0, ymm0, ymm2
+    vpshufb ymm1, ymm1, ymm3
+    mov r12, rbx
+    mov r13, rbx
+    mov r14, rbx
+    and rbx, 255
+    shr r12, 16
+    shr r13, 8
+    shr r14, 24
+    and r12, 255
+    and r13, 255
+    popcnt ebx, ebx
+    popcnt r12d, r12d
+    popcnt r13d, r13d
+    popcnt r14d, r14d
+    vmovdqu OWORD PTR [r10], xmm0
+    vextracti128 xmm0, ymm0, 1
+    lea r10, QWORD PTR [r10+2*rbx]
+    sub edx, ebx
+    vmovdqu OWORD PTR [r10], xmm0
+    lea r10, QWORD PTR [r10+2*r12]
+    sub edx, r12d
+    vmovdqu OWORD PTR [r10], xmm1
+    vextracti128 xmm1, ymm1, 1
+    lea r10, QWORD PTR [r10+2*r13]
+    sub edx, r13d
+    vmovdqu OWORD PTR [r10], xmm1
+    lea r10, QWORD PTR [r10+2*r14]
+    sub edx, r14d
+    vpermq ymm0, [r8+192], 148  ; group 5: input bytes 192..239
+    vpermq ymm1, [r8+216], 148
+    vpshufb ymm0, ymm0, ymm9
+    vpshufb ymm1, ymm1, ymm9
+    vpsrlw ymm2, ymm0, 4
+    vpsrlw ymm3, ymm1, 4
+    vpblendw ymm0, ymm0, ymm2, 170
+    vpblendw ymm1, ymm1, ymm3, 170
+    vpand ymm0, ymm0, ymm8
+    vpand ymm1, ymm1, ymm8
+    vpcmpgtw ymm2, ymm6, ymm0
+    vpcmpgtw ymm3, ymm6, ymm1
+    vpacksswb ymm2, ymm2, ymm3
+    vpmovmskb rbx, ymm2
+    movzx r12d, bl
+    movzx ecx, bh
+    mov r13, rbx
+    mov r14, rbx
+    shr r13, 16
+    shr r14, 24
+    and r13, 255
+    and r14, 255
+    movq xmm2, QWORD PTR [r11+8*r12]
+    movq xmm3, QWORD PTR [r11+8*rcx]
+    movq xmm4, QWORD PTR [r11+8*r13]
+    movq xmm5, QWORD PTR [r11+8*r14]
+    vinserti128 ymm2, ymm2, xmm4, 1
+    vinserti128 ymm3, ymm3, xmm5, 1
+    vpaddb ymm4, ymm2, ymm7
+    vpaddb ymm5, ymm3, ymm7
+    vpunpcklbw ymm2, ymm2, ymm4
+    vpunpcklbw ymm3, ymm3, ymm5
+    vpshufb ymm0, ymm0, ymm2
+    vpshufb ymm1, ymm1, ymm3
+    mov r12, rbx
+    mov r13, rbx
+    mov r14, rbx
+    and rbx, 255
+    shr r12, 16
+    shr r13, 8
+    shr r14, 24
+    and r12, 255
+    and r13, 255
+    popcnt ebx, ebx
+    popcnt r12d, r12d
+    popcnt r13d, r13d
+    popcnt r14d, r14d
+    vmovdqu OWORD PTR [r10], xmm0
+    vextracti128 xmm0, ymm0, 1
+    lea r10, QWORD PTR [r10+2*rbx]
+    sub edx, ebx
+    vmovdqu OWORD PTR [r10], xmm0
+    lea r10, QWORD PTR [r10+2*r12]
+    sub edx, r12d
+    vmovdqu OWORD PTR [r10], xmm1
+    vextracti128 xmm1, ymm1, 1
+    lea r10, QWORD PTR [r10+2*r13]
+    sub edx, r13d
+    vmovdqu OWORD PTR [r10], xmm1
+    lea r10, QWORD PTR [r10+2*r14]
+    sub edx, r14d
+    vpermq ymm0, [r8+240], 148  ; group 6: input bytes 240..287
+    vpermq ymm1, [r8+264], 148
+    vpshufb ymm0, ymm0, ymm9
+    vpshufb ymm1, ymm1, ymm9
+    vpsrlw ymm2, ymm0, 4
+    vpsrlw ymm3, ymm1, 4
+    vpblendw ymm0, ymm0, ymm2, 170
+    vpblendw ymm1, ymm1, ymm3, 170
+    vpand ymm0, ymm0, ymm8
+    vpand ymm1, ymm1, ymm8
+    vpcmpgtw ymm2, ymm6, ymm0
+    vpcmpgtw ymm3, ymm6, ymm1
+    vpacksswb ymm2, ymm2, ymm3
+    vpmovmskb rbx, ymm2
+    movzx r12d, bl
+    movzx ecx, bh
+    mov r13, rbx
+    mov r14, rbx
+    shr r13, 16
+    shr r14, 24
+    and r13, 255
+    and r14, 255
+    movq xmm2, QWORD PTR [r11+8*r12]
+    movq xmm3, QWORD PTR [r11+8*rcx]
+    movq xmm4, QWORD PTR [r11+8*r13]
+    movq xmm5, QWORD PTR [r11+8*r14]
+    vinserti128 ymm2, ymm2, xmm4, 1
+    vinserti128 ymm3, ymm3, xmm5, 1
+    vpaddb ymm4, ymm2, ymm7
+    vpaddb ymm5, ymm3, ymm7
+    vpunpcklbw ymm2, ymm2, ymm4
+    vpunpcklbw ymm3, ymm3, ymm5
+    vpshufb ymm0, ymm0, ymm2
+    vpshufb ymm1, ymm1, ymm3
+    mov r12, rbx
+    mov r13, rbx
+    mov r14, rbx
+    and rbx, 255
+    shr r12, 16
+    shr r13, 8
+    shr r14, 24
+    and r12, 255
+    and r13, 255
+    popcnt ebx, ebx
+    popcnt r12d, r12d
+    popcnt r13d, r13d
+    popcnt r14d, r14d
+    vmovdqu OWORD PTR [r10], xmm0
+    vextracti128 xmm0, ymm0, 1
+    lea r10, QWORD PTR [r10+2*rbx]
+    sub edx, ebx
+    vmovdqu OWORD PTR [r10], xmm0
+    lea r10, QWORD PTR [r10+2*r12]
+    sub edx, r12d
+    vmovdqu OWORD PTR [r10], xmm1
+    vextracti128 xmm1, ymm1, 1
+    lea r10, QWORD PTR [r10+2*r13]
+    sub edx, r13d
+    vmovdqu OWORD PTR [r10], xmm1
+    lea r10, QWORD PTR [r10+2*r14]
+    sub edx, r14d
+    vpermq ymm0, [r8+288], 148  ; group 7: input bytes 288..335
+    vpermq ymm1, [r8+312], 148
+    vpshufb ymm0, ymm0, ymm9
+    vpshufb ymm1, ymm1, ymm9
+    vpsrlw ymm2, ymm0, 4
+    vpsrlw ymm3, ymm1, 4
+    vpblendw ymm0, ymm0, ymm2, 170
+    vpblendw ymm1, ymm1, ymm3, 170
+    vpand ymm0, ymm0, ymm8
+    vpand ymm1, ymm1, ymm8
+    vpcmpgtw ymm2, ymm6, ymm0
+    vpcmpgtw ymm3, ymm6, ymm1
+    vpacksswb ymm2, ymm2, ymm3
+    vpmovmskb rbx, ymm2
+    movzx r12d, bl
+    movzx ecx, bh
+    mov r13, rbx
+    mov r14, rbx
+    shr r13, 16
+    shr r14, 24
+    and r13, 255
+    and r14, 255
+    movq xmm2, QWORD PTR [r11+8*r12]
+    movq xmm3, QWORD PTR [r11+8*rcx]
+    movq xmm4, QWORD PTR [r11+8*r13]
+    movq xmm5, QWORD PTR [r11+8*r14]
+    vinserti128 ymm2, ymm2, xmm4, 1
+    vinserti128 ymm3, ymm3, xmm5, 1
+    vpaddb ymm4, ymm2, ymm7
+    vpaddb ymm5, ymm3, ymm7
+    vpunpcklbw ymm2, ymm2, ymm4
+    vpunpcklbw ymm3, ymm3, ymm5
+    vpshufb ymm0, ymm0, ymm2
+    vpshufb ymm1, ymm1, ymm3
+    mov r12, rbx
+    mov r13, rbx
+    mov r14, rbx
+    and rbx, 255
+    shr r12, 16
+    shr r13, 8
+    shr r14, 24
+    and r12, 255
+    and r13, 255
+    popcnt ebx, ebx
+    popcnt r12d, r12d
+    popcnt r13d, r13d
+    popcnt r14d, r14d
+    vmovdqu OWORD PTR [r10], xmm0
+    vextracti128 xmm0, ymm0, 1
+    lea r10, QWORD PTR [r10+2*rbx]
+    sub edx, ebx
+    vmovdqu OWORD PTR [r10], xmm0
+    lea r10, QWORD PTR [r10+2*r12]
+    sub edx, r12d
+    vmovdqu OWORD PTR [r10], xmm1
+    vextracti128 xmm1, ymm1, 1
+    lea r10, QWORD PTR [r10+2*r13]
+    sub edx, r13d
+    vmovdqu OWORD PTR [r10], xmm1
+    lea r10, QWORD PTR [r10+2*r14]
+    sub edx, r14d
+    add r8, 336  ; the seven groups consumed 7 * 48 input bytes
+    sub r9d, 336
+L_mlkem_rej_uniform_n_avx2_start_256:  ; 48-byte loop; entered by fall-through, so its first pass is also unchecked
+    vpermq ymm0, [r8], 148
+    vpermq ymm1, [r8+24], 148
+    vpshufb ymm0, ymm0, ymm9
+    vpshufb ymm1, ymm1, ymm9
+    vpsrlw ymm2, ymm0, 4
+    vpsrlw ymm3, ymm1, 4
+    vpblendw ymm0, ymm0, ymm2, 170
+    vpblendw ymm1, ymm1, ymm3, 170
+    vpand ymm0, ymm0, ymm8
+    vpand ymm1, ymm1, ymm8
+    vpcmpgtw ymm2, ymm6, ymm0
+    vpcmpgtw ymm3, ymm6, ymm1
+    vpacksswb ymm2, ymm2, ymm3
+    vpmovmskb rbx, ymm2
+    movzx r12d, bl
+    movzx ecx, bh
+    mov r13, rbx
+    mov r14, rbx
+    shr r13, 16
+    shr r14, 24
+    and r13, 255
+    and r14, 255
+    movq xmm2, QWORD PTR [r11+8*r12]
+    movq xmm3, QWORD PTR [r11+8*rcx]
+    movq xmm4, QWORD PTR [r11+8*r13]
+    movq xmm5, QWORD PTR [r11+8*r14]
+    vinserti128 ymm2, ymm2, xmm4, 1
+    vinserti128 ymm3, ymm3, xmm5, 1
+    vpaddb ymm4, ymm2, ymm7
+    vpaddb ymm5, ymm3, ymm7
+    vpunpcklbw ymm2, ymm2, ymm4
+    vpunpcklbw ymm3, ymm3, ymm5
+    vpshufb ymm0, ymm0, ymm2
+    vpshufb ymm1, ymm1, ymm3
+    mov r12, rbx
+    mov r13, rbx
+    mov r14, rbx
+    and rbx, 255
+    shr r12, 16
+    shr r13, 8
+    shr r14, 24
+    and r12, 255
+    and r13, 255
+    popcnt ebx, ebx
+    popcnt r12d, r12d
+    popcnt r13d, r13d
+    popcnt r14d, r14d
+    vmovdqu OWORD PTR [r10], xmm0
+    vextracti128 xmm0, ymm0, 1
+    lea r10, QWORD PTR [r10+2*rbx]
+    sub edx, ebx
+    vmovdqu OWORD PTR [r10], xmm0
+    lea r10, QWORD PTR [r10+2*r12]
+    sub edx, r12d
+    vmovdqu OWORD PTR [r10], xmm1
+    vextracti128 xmm1, ymm1, 1
+    lea r10, QWORD PTR [r10+2*r13]
+    sub edx, r13d
+    vmovdqu OWORD PTR [r10], xmm1
+    lea r10, QWORD PTR [r10+2*r14]
+    sub edx, r14d
+    add r8, 48
+    sub r9d, 48
+    cmp r9d, 48  ; repeat only while at least 48 input bytes remain ...
+    jl L_mlkem_rej_uniform_n_avx2_done_256
+    cmp edx, 32  ; ... and at least 32 words are still wanted (a pass can accept up to 32)
+    jge L_mlkem_rej_uniform_n_avx2_start_256
+L_mlkem_rej_uniform_n_avx2_done_256:
+    cmp edx, 8  ; 12-byte / 8-candidate loop needs >= 8 words wanted and >= 12 input bytes
+    jl L_mlkem_rej_uniform_n_avx2_done_128
+    cmp r9d, 12
+    jl L_mlkem_rej_uniform_n_avx2_done_128
+L_mlkem_rej_uniform_n_avx2_start_128:
+    vmovdqu xmm0, OWORD PTR [r8]  ; loads 16 bytes; the low-lane shuffle only references bytes 0..11
+    vpshufb xmm0, xmm0, xmm9
+    vpsrlw xmm2, xmm0, 4
+    vpblendw xmm0, xmm0, xmm2, 170
+    vpand xmm0, xmm0, xmm8  ; 8 candidates of 12 bits
+    vpcmpgtw xmm2, xmm6, xmm0
+    vpmovmskb rbx, xmm2  ; two identical mask bits per word
+    mov r12, 21845  ; 5555h: select one bit per word
+    pext ebx, ebx, r12d  ; ebx = 8-bit accept mask
+    movq xmm3, QWORD PTR [r11+8*rbx]
+    vpaddb xmm4, xmm3, xmm7
+    vpunpcklbw xmm3, xmm3, xmm4
+    vpshufb xmm0, xmm0, xmm3  ; accepted words packed to the front
+    vmovdqu OWORD PTR [r10], xmm0
+    popcnt ecx, ebx  ; number accepted
+    lea r10, QWORD PTR [r10+2*rcx]
+    sub edx, ecx
+    add r8, 12
+    sub r9d, 12
+    cmp r9d, 12
+    jl L_mlkem_rej_uniform_n_avx2_done_128
+    cmp edx, 8
+    jge L_mlkem_rej_uniform_n_avx2_start_128
+L_mlkem_rej_uniform_n_avx2_done_128:
+    cmp r9d, 0  ; scalar tail: skipped when no input is left or nothing more is wanted
+    je L_mlkem_rej_uniform_n_avx2_done_64
+    cmp edx, 0
+    je L_mlkem_rej_uniform_n_avx2_done_64
+    mov rsi, 1152657617789587455  ; 0fff0fff0fff0fffh: pdep mask spreading 48 bits into four 12-bit fields, one per 16-bit word
+    mov r12, 2305878194122661888  ; r12, r13 and r14 are loaded here but not read again before the routine returns
+    mov r13, 937044495634074881
+    mov r14, 1152939097061330944
+L_mlkem_rej_uniform_n_avx2_start_64:
+    mov rcx, QWORD PTR [r8]  ; reads 8 bytes although only 6 are consumed per pass
+    pdep rcx, rcx, rsi  ; four 12-bit candidates, one per 16-bit word of rcx
+    cmp cx, 3329
+    jge L_mlkem_rej_uniform_0_avx2_rej_large_0  ; candidate >= 3329: skip the store
+    mov WORD PTR [r10], cx
+    add r10, 2
+    sub edx, 1
+    je L_mlkem_rej_uniform_n_avx2_done_64  ; stop as soon as the requested count is reached
+L_mlkem_rej_uniform_0_avx2_rej_large_0:
+    shr rcx, 16  ; next candidate
+    cmp cx, 3329
+    jge L_mlkem_rej_uniform_0_avx2_rej_large_1
+    mov WORD PTR [r10], cx
+    add r10, 2
+    sub edx, 1
+    je L_mlkem_rej_uniform_n_avx2_done_64
+L_mlkem_rej_uniform_0_avx2_rej_large_1:
+    shr rcx, 16
+    cmp cx, 3329
+    jge L_mlkem_rej_uniform_0_avx2_rej_large_2
+    mov WORD PTR [r10], cx
+    add r10, 2
+    sub edx, 1
+    je L_mlkem_rej_uniform_n_avx2_done_64
+L_mlkem_rej_uniform_0_avx2_rej_large_2:
+    shr rcx, 16
+    cmp cx, 3329
+    jge L_mlkem_rej_uniform_0_avx2_rej_large_3
+    mov WORD PTR [r10], cx
+    add r10, 2
+    sub edx, 1
+    je L_mlkem_rej_uniform_n_avx2_done_64
+L_mlkem_rej_uniform_0_avx2_rej_large_3:
+    add r8, 6  ; 6 input bytes = 4 candidates consumed
+    sub r9d, 6
+    jle L_mlkem_rej_uniform_n_avx2_done_64  ; input exhausted
+    cmp edx, 0
+    jg L_mlkem_rej_uniform_n_avx2_start_64
+L_mlkem_rej_uniform_n_avx2_done_64:
+    vzeroupper
+    sub eax, edx  ; return value: requested count minus words still missing
+    vmovdqu xmm6, OWORD PTR [rsp]
+    vmovdqu xmm7, OWORD PTR [rsp+16]
+    vmovdqu xmm8, OWORD PTR [rsp+32]
+    vmovdqu xmm9, OWORD PTR [rsp+48]
+    add rsp, 64
+    pop rbp
+    pop rsi
+    pop rdi
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+    ret
+mlkem_rej_uniform_n_avx2 ENDP
+_TEXT ENDS
+_TEXT SEGMENT READONLY PARA  ; ---- mlkem_rej_uniform_avx2: same sampling, but the count is checked before every vector step ----
+mlkem_rej_uniform_avx2 PROC  ; In: rcx = output (16-bit words), edx = words wanted, r8 = input bytes, r9d = input length. Out: eax = words written.
+    push rbx
+    push r12
+    push r13
+    push r14
+    push r15
+    push rdi
+    push rsi
+    push rbp
+    mov r10, rcx  ; r10 = running output pointer
+    sub rsp, 64  ; scratch for xmm6..xmm9
+    vmovdqu OWORD PTR [rsp], xmm6
+    vmovdqu OWORD PTR [rsp+16], xmm7
+    vmovdqu OWORD PTR [rsp+32], xmm8
+    vmovdqu OWORD PTR [rsp+48], xmm9
+    mov eax, edx  ; remember the requested count for the return value
+    cmp edx, 0
+    je L_mlkem_rej_uniform_avx2_done_64  ; nothing wanted: straight to the epilogue
+    cmp edx, 8
+    jl L_mlkem_rej_uniform_avx2_done_128  ; fewer than 8 wanted: scalar tail only (it uses none of the vector constants)
+    vmovdqu ymm6, YMMWORD PTR L_mlkem_rej_q  ; ymm6 = 3329 per word
+    vmovdqu ymm7, YMMWORD PTR L_mlkem_rej_ones  ; ymm7 = 1 per byte
+    vmovdqu ymm8, YMMWORD PTR L_mlkem_rej_mask  ; ymm8 = 0fffh per word
+    vmovdqu ymm9, YMMWORD PTR L_mlkem_rej_shuffle  ; ymm9 = 3-bytes-to-2-words shuffle control
+    mov r11, QWORD PTR [ptr_L_mlkem_rej_idx]  ; r11 = base of the mask -> byte-offset table
+    mov rdi, 1229782938247303441  ; rdi, rbp and r15 are loaded here but not referenced again in this routine
+    mov rbp, 1012195045828461056
+    mov r15, 72340172838076673
+    cmp edx, 32
+    jl L_mlkem_rej_uniform_avx2_done_256  ; fewer than 32 wanted: skip the 48-byte steps
+    vpermq ymm0, [r8], 148  ; first unrolled 48-byte step. NOTE(review): only edx is checked before it, not r9d - presumably callers pass enough input; confirm
+    vpermq ymm1, [r8+24], 148
+    vpshufb ymm0, ymm0, ymm9
+    vpshufb ymm1, ymm1, ymm9
+    vpsrlw ymm2, ymm0, 4
+    vpsrlw ymm3, ymm1, 4
+    vpblendw ymm0, ymm0, ymm2, 170
+    vpblendw ymm1, ymm1, ymm3, 170
+    vpand ymm0, ymm0, ymm8
+    vpand ymm1, ymm1, ymm8
+    vpcmpgtw ymm2, ymm6, ymm0
+    vpcmpgtw ymm3, ymm6, ymm1
+    vpacksswb ymm2, ymm2, ymm3
+    vpmovmskb rbx, ymm2
+    movzx r12d, bl
+    movzx ecx, bh
+    mov r13, rbx
+    mov r14, rbx
+    shr r13, 16
+    shr r14, 24
+    and r13, 255
+    and
r14, 255  ; (operands of the `and` wrapped from the previous line) r14 = accept-mask bits 24..31
+    movq xmm2, QWORD PTR [r11+8*r12]  ; r11 indexes the byte-offset table with 8-bit accept masks; 8 offset bytes per group of 8 candidates
+    movq xmm3, QWORD PTR [r11+8*rcx]
+    movq xmm4, QWORD PTR [r11+8*r13]
+    movq xmm5, QWORD PTR [r11+8*r14]
+    vinserti128 ymm2, ymm2, xmm4, 1  ; high-lane offsets go to the upper 128 bits
+    vinserti128 ymm3, ymm3, xmm5, 1
+    vpaddb ymm4, ymm2, ymm7  ; ymm7 is added byte-wise: offset + 1 addresses the high byte of each kept word
+    vpaddb ymm5, ymm3, ymm7
+    vpunpcklbw ymm2, ymm2, ymm4  ; (2k, 2k+1) byte pairs = word-granular shuffle control
+    vpunpcklbw ymm3, ymm3, ymm5
+    vpshufb ymm0, ymm0, ymm2  ; accepted words move to the front of each lane
+    vpshufb ymm1, ymm1, ymm3
+    mov r12, rbx  ; count accepted words in each group of 8 mask bits
+    mov r13, rbx
+    mov r14, rbx
+    and rbx, 255
+    shr r12, 16
+    shr r13, 8
+    shr r14, 24
+    and r12, 255
+    and r13, 255
+    popcnt ebx, ebx  ; mask bits 0..7: ymm0 low lane
+    popcnt r12d, r12d  ; mask bits 16..23: ymm0 high lane
+    popcnt r13d, r13d  ; mask bits 8..15: ymm1 low lane
+    popcnt r14d, r14d  ; mask bits 24..31: ymm1 high lane
+    vmovdqu OWORD PTR [r10], xmm0  ; each store writes a full 16 bytes; r10 advances by only 2*count, so the next store overwrites the surplus
+    vextracti128 xmm0, ymm0, 1
+    lea r10, QWORD PTR [r10+2*rbx]
+    sub edx, ebx  ; edx = words still wanted
+    vmovdqu OWORD PTR [r10], xmm0
+    lea r10, QWORD PTR [r10+2*r12]
+    sub edx, r12d
+    vmovdqu OWORD PTR [r10], xmm1
+    vextracti128 xmm1, ymm1, 1
+    lea r10, QWORD PTR [r10+2*r13]
+    sub edx, r13d
+    vmovdqu OWORD PTR [r10], xmm1
+    lea r10, QWORD PTR [r10+2*r14]
+    sub edx, r14d
+    add r8, 48  ; r8 = input pointer, r9d = input bytes left
+    sub r9d, 48
+    cmp edx, 32
+    jl L_mlkem_rej_uniform_avx2_done_256  ; fewer than 32 still wanted: leave the 48-byte path
+    vpermq ymm0, [r8], 148  ; second unrolled 48-byte step; 148 selects qwords 0,1,1,2 (low lane bytes 0..15, high lane bytes 8..23)
+    vpermq ymm1, [r8+24], 148
+    vpshufb ymm0, ymm0, ymm9  ; ymm9 shuffle: every 3 bytes become two 16-bit words
+    vpshufb ymm1, ymm1, ymm9
+    vpsrlw ymm2, ymm0, 4  ; shifted copy for the odd words
+    vpsrlw ymm3, ymm1, 4
+    vpblendw ymm0, ymm0, ymm2, 170  ; 170 = 0aah: odd words take the shifted copy
+    vpblendw ymm1, ymm1, ymm3, 170
+    vpand ymm0, ymm0, ymm8  ; ymm8 mask keeps 12 bits per word
+    vpand ymm1, ymm1, ymm8
+    vpcmpgtw ymm2, ymm6, ymm0  ; all ones where the ymm6 bound is greater than the candidate
+    vpcmpgtw ymm3, ymm6, ymm1
+    vpacksswb ymm2, ymm2, ymm3  ; one byte per candidate
+    vpmovmskb rbx, ymm2  ; 32-bit accept mask
+    movzx r12d, bl
+    movzx ecx, bh
+    mov r13, rbx
+    mov r14, rbx
+    shr r13, 16
+    shr r14, 24
+    and r13, 255
+    and r14, 255
+    movq xmm2, QWORD PTR [r11+8*r12]
+    movq xmm3, QWORD PTR [r11+8*rcx]
+    movq xmm4, QWORD PTR [r11+8*r13]
+    movq xmm5, QWORD PTR [r11+8*r14]
+    vinserti128 ymm2, ymm2, xmm4, 1
+    vinserti128 ymm3, ymm3, xmm5, 1
+    vpaddb ymm4, ymm2, ymm7
+    vpaddb ymm5, ymm3, ymm7
+    vpunpcklbw ymm2, ymm2, ymm4
+    vpunpcklbw ymm3, ymm3, ymm5
+    vpshufb ymm0, ymm0, ymm2
+    vpshufb ymm1, ymm1, ymm3
+    mov r12, rbx
+    mov r13, rbx
+    mov r14, rbx
+    and rbx, 255
+    shr r12, 16
+    shr r13, 8
+    shr r14, 24
+    and r12, 255
+    and r13, 255
+    popcnt ebx, ebx
+    popcnt r12d, r12d
+    popcnt r13d, r13d
+    popcnt r14d, r14d
+    vmovdqu OWORD PTR [r10], xmm0
+    vextracti128 xmm0, ymm0, 1
+    lea r10, QWORD PTR [r10+2*rbx]
+    sub edx, ebx
+    vmovdqu OWORD PTR [r10], xmm0
+    lea r10, QWORD PTR [r10+2*r12]
+    sub edx, r12d
+    vmovdqu OWORD PTR [r10], xmm1
+    vextracti128 xmm1, ymm1, 1
+    lea r10, QWORD PTR [r10+2*r13]
+    sub edx, r13d
+    vmovdqu OWORD PTR [r10], xmm1
+    lea r10, QWORD PTR [r10+2*r14]
+    sub edx, r14d
+    add r8, 48
+    sub r9d, 48
+    cmp edx, 32
+    jl L_mlkem_rej_uniform_avx2_done_256
+L_mlkem_rej_uniform_avx2_start_256:  ; 48-byte loop: same step, repeated while r9d >= 48 and edx >= 32 (checked at the bottom)
+    vpermq ymm0, [r8], 148
+    vpermq ymm1, [r8+24], 148
+    vpshufb ymm0, ymm0, ymm9
+    vpshufb ymm1, ymm1, ymm9
+    vpsrlw ymm2, ymm0, 4
+    vpsrlw ymm3, ymm1, 4
+    vpblendw ymm0, ymm0, ymm2, 170
+    vpblendw ymm1, ymm1, ymm3, 170
+    vpand ymm0, ymm0, ymm8
+    vpand ymm1, ymm1, ymm8
+    vpcmpgtw ymm2, ymm6, ymm0
+    vpcmpgtw ymm3, ymm6, ymm1
+    vpacksswb ymm2, ymm2, ymm3
+    vpmovmskb rbx, ymm2
+    movzx r12d, bl
+    movzx ecx, bh
+    mov r13, rbx
+    mov r14, rbx
+    shr r13, 16
+    shr r14, 24
+    and r13, 255
+    and r14, 255
+    movq xmm2, QWORD PTR [r11+8*r12]
+    movq xmm3, QWORD PTR [r11+8*rcx]
+    movq xmm4, QWORD PTR [r11+8*r13]
+    movq xmm5, QWORD PTR [r11+8*r14]
+    vinserti128 ymm2, ymm2, xmm4, 1
+    vinserti128 ymm3, ymm3, xmm5, 1
+    vpaddb ymm4, ymm2, ymm7
+    vpaddb ymm5, ymm3, ymm7
+    vpunpcklbw ymm2, ymm2, ymm4
+    vpunpcklbw ymm3, ymm3, ymm5
+    vpshufb ymm0, ymm0, ymm2
+    vpshufb ymm1, ymm1, ymm3
+    mov r12, rbx
+    mov r13, rbx
+    mov r14, rbx
+    and rbx, 255
+    shr r12, 16
+    shr r13, 8
+    shr r14, 24
+    and r12, 255
+    and r13, 255
+    popcnt ebx, ebx
+    popcnt r12d, r12d
+    popcnt r13d, r13d
+    popcnt r14d, r14d
+    vmovdqu OWORD PTR [r10], xmm0
+    vextracti128 xmm0, ymm0, 1
+    lea r10, QWORD PTR [r10+2*rbx]
+    sub edx, ebx
+    vmovdqu OWORD PTR [r10], xmm0
+    lea r10, QWORD PTR [r10+2*r12]
+    sub edx, r12d
+    vmovdqu OWORD PTR [r10], xmm1
+    vextracti128 xmm1, ymm1, 1
+    lea r10, QWORD PTR [r10+2*r13]
+    sub edx, r13d
+    vmovdqu OWORD PTR [r10], xmm1
+    lea r10, QWORD PTR [r10+2*r14]
+    sub edx, r14d
+    add r8, 48
+    sub r9d, 48
+    cmp r9d, 48
+    jl L_mlkem_rej_uniform_avx2_done_256
+    cmp edx, 32
+    jge L_mlkem_rej_uniform_avx2_start_256
+L_mlkem_rej_uniform_avx2_done_256:
+    cmp edx, 8  ; 12-byte / 8-candidate loop needs >= 8 words wanted and >= 12 input bytes
+    jl L_mlkem_rej_uniform_avx2_done_128
+    cmp r9d, 12
+    jl L_mlkem_rej_uniform_avx2_done_128
+L_mlkem_rej_uniform_avx2_start_128:
+    vmovdqu xmm0, OWORD PTR [r8]  ; loads 16 bytes in one go
+    vpshufb xmm0, xmm0, xmm9
+    vpsrlw xmm2, xmm0, 4
+    vpblendw xmm0, xmm0, xmm2, 170
+    vpand xmm0, xmm0, xmm8  ; 8 candidates of 12 bits
+    vpcmpgtw xmm2, xmm6, xmm0
+    vpmovmskb rbx, xmm2  ; two identical mask bits per word
+    mov r12, 21845  ; 5555h: select one bit per word
+    pext ebx, ebx, r12d  ; ebx = 8-bit accept mask
+    movq xmm3, QWORD PTR [r11+8*rbx]
+    vpaddb xmm4, xmm3, xmm7
+    vpunpcklbw xmm3, xmm3, xmm4
+    vpshufb xmm0, xmm0, xmm3  ; accepted words packed to the front
+    vmovdqu OWORD PTR [r10], xmm0
+    popcnt ecx, ebx  ; number accepted
+    lea r10, QWORD PTR [r10+2*rcx]
+    sub edx, ecx
+    add r8, 12
+    sub r9d, 12
+    cmp r9d, 12
+    jl L_mlkem_rej_uniform_avx2_done_128
+    cmp edx, 8
+    jge L_mlkem_rej_uniform_avx2_start_128
+L_mlkem_rej_uniform_avx2_done_128:
+    cmp r9d, 0  ; scalar tail: skipped when no input is left or nothing more is wanted
+    je L_mlkem_rej_uniform_avx2_done_64
+    cmp edx, 0
+    je L_mlkem_rej_uniform_avx2_done_64
+    mov rsi, 1152657617789587455  ; 0fff0fff0fff0fffh: pdep mask spreading 48 bits into four 12-bit fields, one per 16-bit word
+    mov r12, 2305878194122661888  ; r12, r13 and r14 are loaded here but not read again before the routine returns
+    mov r13, 937044495634074881
+    mov r14, 1152939097061330944
+L_mlkem_rej_uniform_avx2_start_64:
+    mov rcx, QWORD PTR [r8]  ; reads 8 bytes although only 6 are consumed per pass
+    pdep rcx, rcx, rsi  ; four 12-bit candidates, one per 16-bit word of rcx
+    cmp cx, 3329
+    jge L_mlkem_rej_uniform_avx2_rej_large_0  ; candidate >= 3329: skip the store
+    mov WORD PTR [r10], cx
+    add r10, 2
+    sub edx, 1
+    je L_mlkem_rej_uniform_avx2_done_64  ; stop as soon as the requested count is reached
+L_mlkem_rej_uniform_avx2_rej_large_0:
+    shr rcx, 16  ; next candidate
+    cmp cx, 3329
+    jge L_mlkem_rej_uniform_avx2_rej_large_1
+    mov WORD PTR [r10], cx
+    add r10, 2
+    sub edx, 1
+    je L_mlkem_rej_uniform_avx2_done_64
+L_mlkem_rej_uniform_avx2_rej_large_1:
+    shr rcx, 16
+    cmp cx, 3329
+    jge L_mlkem_rej_uniform_avx2_rej_large_2
+    mov WORD PTR [r10], cx
+    add r10, 2
+    sub edx, 1
+    je L_mlkem_rej_uniform_avx2_done_64
+L_mlkem_rej_uniform_avx2_rej_large_2:
+    shr rcx, 16
+    cmp cx, 3329
+    jge L_mlkem_rej_uniform_avx2_rej_large_3
+    mov WORD PTR [r10], cx
+    add r10, 2
+    sub edx, 1
+    je L_mlkem_rej_uniform_avx2_done_64
+L_mlkem_rej_uniform_avx2_rej_large_3:
+    add r8, 6  ; 6 input bytes = 4 candidates consumed
+    sub r9d, 6
+    jle L_mlkem_rej_uniform_avx2_done_64  ; input exhausted
+    cmp edx, 0
+    jg L_mlkem_rej_uniform_avx2_start_64
+L_mlkem_rej_uniform_avx2_done_64:
+    vzeroupper
+    sub eax, edx  ; eax was set on entry; what is left after subtracting the unfilled remainder is returned
+    vmovdqu xmm6, OWORD PTR [rsp]  ; restore xmm6..xmm9 from the stack slots
+    vmovdqu xmm7, OWORD PTR [rsp+16]
+    vmovdqu xmm8, OWORD PTR [rsp+32]
+    vmovdqu xmm9, OWORD PTR [rsp+48]
+    add rsp, 64
+    pop rbp
+    pop rsi
+    pop rdi
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+    ret
+mlkem_rej_uniform_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_249 QWORD 0024924900249249h, 0024924900249249h  ; 00249249h per dword: bits 0,3,6,...,21 set (every third bit)
+    QWORD 0024924900249249h, 0024924900249249h
+ptr_L_mlkem_mask_249 QWORD L_mlkem_mask_249
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_6db QWORD 006db6db006db6dbh, 006db6db006db6dbh  ; 006db6dbh per dword: the low two bits of every 3-bit group set
+    QWORD 006db6db006db6dbh, 006db6db006db6dbh
+ptr_L_mlkem_mask_6db QWORD L_mlkem_mask_6db
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_07 QWORD 0000000700000007h, 0000000700000007h  ; 7 in the low word of every dword
+    QWORD 0000000700000007h, 0000000700000007h
+ptr_L_mlkem_mask_07 QWORD L_mlkem_mask_07
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_70 QWORD 0007000000070000h, 0007000000070000h  ; 7 in the high word of every dword
+    QWORD 0007000000070000h, 0007000000070000h
+ptr_L_mlkem_mask_70 QWORD L_mlkem_mask_70
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_mask_3 QWORD 0003000300030003h, 0003000300030003h  ; 3 in every 16-bit word
+    QWORD 0003000300030003h, 0003000300030003h
+ptr_L_mlkem_mask_3 QWORD L_mlkem_mask_3
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_shuff QWORD 0ff050403ff020100h, 0ff0b0a09ff080706h  ; byte shuffle: each 3 source bytes fill the low 3 bytes of a dword; index 0ffh zeroes the top byte
+    QWORD 0ff090807ff060504h, 0ff0f0e0dff0c0b0ah  ; high lane: same layout starting at lane byte 4
+ptr_L_mlkem_shuff QWORD L_mlkem_shuff
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_cbd_eta3_avx2 PROC
+    sub rsp, 128
+    vmovdqu OWORD PTR [rsp], xmm6
+    vmovdqu OWORD PTR [rsp+16], xmm7
+    vmovdqu OWORD PTR [rsp+32], xmm8
+    vmovdqu OWORD PTR [rsp+48], xmm9
+    vmovdqu OWORD PTR [rsp+64], xmm10
+    vmovdqu OWORD PTR [rsp+80], xmm11
+    vmovdqu OWORD PTR [rsp+96], xmm12
+ vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu ymm8, YMMWORD PTR L_mlkem_mask_249 + vmovdqu ymm9, YMMWORD PTR L_mlkem_mask_6db + vmovdqu ymm10, YMMWORD PTR L_mlkem_mask_07 + vmovdqu ymm11, YMMWORD PTR L_mlkem_mask_70 + vmovdqu ymm12, YMMWORD PTR L_mlkem_mask_3 + vmovdqu ymm13, YMMWORD PTR L_mlkem_shuff + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm1, YMMWORD PTR [rdx+24] + vpermq ymm0, ymm0, 148 + vpermq ymm1, ymm1, 148 + vpshufb ymm0, ymm0, ymm13 + vpshufb ymm1, ymm1, ymm13 + vpsrld ymm2, ymm0, 1 + vpsrld ymm3, ymm1, 1 + vpsrld ymm4, ymm0, 2 + vpsrld ymm5, ymm1, 2 + vpand ymm0, ymm0, ymm8 + vpand ymm1, ymm1, ymm8 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpand ymm4, ymm4, ymm8 + vpand ymm5, ymm5, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpsrld ymm2, ymm0, 3 + vpsrld ymm3, ymm1, 3 + vpaddd ymm0, ymm0, ymm9 + vpaddd ymm1, ymm1, ymm9 + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm3 + vpslld ymm2, ymm0, 10 + vpslld ymm3, ymm1, 10 + vpsrld ymm4, ymm0, 12 + vpsrld ymm5, ymm1, 12 + vpsrld ymm6, ymm0, 2 + vpsrld ymm7, ymm1, 2 + vpand ymm0, ymm0, ymm10 + vpand ymm1, ymm1, ymm10 + vpand ymm2, ymm2, ymm11 + vpand ymm3, ymm3, ymm11 + vpand ymm4, ymm4, ymm10 + vpand ymm5, ymm5, ymm10 + vpand ymm6, ymm6, ymm11 + vpand ymm7, ymm7, ymm11 + vpaddw ymm0, ymm0, ymm2 + vpaddw ymm1, ymm1, ymm3 + vpaddw ymm2, ymm4, ymm6 + vpaddw ymm3, ymm5, ymm7 + vpsubw ymm0, ymm0, ymm12 + vpsubw ymm1, ymm1, ymm12 + vpsubw ymm2, ymm2, ymm12 + vpsubw ymm3, ymm3, ymm12 + vpunpckldq ymm4, ymm0, ymm2 + vpunpckldq ymm5, ymm1, ymm3 + vpunpckhdq ymm6, ymm0, ymm2 + vpunpckhdq ymm7, ymm1, ymm3 + vperm2i128 ymm0, ymm4, ymm6, 32 + vperm2i128 ymm1, ymm5, ymm7, 32 + vperm2i128 ymm2, ymm4, ymm6, 49 + vperm2i128 ymm3, ymm5, ymm7, 49 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu ymm0, YMMWORD PTR [rdx+48] + vmovdqu ymm1, 
YMMWORD PTR [rdx+72] + vpermq ymm0, ymm0, 148 + vpermq ymm1, ymm1, 148 + vpshufb ymm0, ymm0, ymm13 + vpshufb ymm1, ymm1, ymm13 + vpsrld ymm2, ymm0, 1 + vpsrld ymm3, ymm1, 1 + vpsrld ymm4, ymm0, 2 + vpsrld ymm5, ymm1, 2 + vpand ymm0, ymm0, ymm8 + vpand ymm1, ymm1, ymm8 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpand ymm4, ymm4, ymm8 + vpand ymm5, ymm5, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpsrld ymm2, ymm0, 3 + vpsrld ymm3, ymm1, 3 + vpaddd ymm0, ymm0, ymm9 + vpaddd ymm1, ymm1, ymm9 + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm3 + vpslld ymm2, ymm0, 10 + vpslld ymm3, ymm1, 10 + vpsrld ymm4, ymm0, 12 + vpsrld ymm5, ymm1, 12 + vpsrld ymm6, ymm0, 2 + vpsrld ymm7, ymm1, 2 + vpand ymm0, ymm0, ymm10 + vpand ymm1, ymm1, ymm10 + vpand ymm2, ymm2, ymm11 + vpand ymm3, ymm3, ymm11 + vpand ymm4, ymm4, ymm10 + vpand ymm5, ymm5, ymm10 + vpand ymm6, ymm6, ymm11 + vpand ymm7, ymm7, ymm11 + vpaddw ymm0, ymm0, ymm2 + vpaddw ymm1, ymm1, ymm3 + vpaddw ymm2, ymm4, ymm6 + vpaddw ymm3, ymm5, ymm7 + vpsubw ymm0, ymm0, ymm12 + vpsubw ymm1, ymm1, ymm12 + vpsubw ymm2, ymm2, ymm12 + vpsubw ymm3, ymm3, ymm12 + vpunpckldq ymm4, ymm0, ymm2 + vpunpckldq ymm5, ymm1, ymm3 + vpunpckhdq ymm6, ymm0, ymm2 + vpunpckhdq ymm7, ymm1, ymm3 + vperm2i128 ymm0, ymm4, ymm6, 32 + vperm2i128 ymm1, ymm5, ymm7, 32 + vperm2i128 ymm2, ymm4, ymm6, 49 + vperm2i128 ymm3, ymm5, ymm7, 49 + vmovdqu YMMWORD PTR [rcx+128], ymm0 + vmovdqu YMMWORD PTR [rcx+160], ymm2 + vmovdqu YMMWORD PTR [rcx+192], ymm1 + vmovdqu YMMWORD PTR [rcx+224], ymm3 + vmovdqu ymm0, YMMWORD PTR [rdx+96] + vmovdqu ymm1, YMMWORD PTR [rdx+120] + vpermq ymm0, ymm0, 148 + vpermq ymm1, ymm1, 148 + vpshufb ymm0, ymm0, ymm13 + vpshufb ymm1, ymm1, ymm13 + vpsrld ymm2, ymm0, 1 + vpsrld ymm3, ymm1, 1 + vpsrld ymm4, ymm0, 2 + vpsrld ymm5, ymm1, 2 + vpand ymm0, ymm0, ymm8 + vpand ymm1, ymm1, ymm8 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpand ymm4, ymm4, ymm8 + vpand 
ymm5, ymm5, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpsrld ymm2, ymm0, 3 + vpsrld ymm3, ymm1, 3 + vpaddd ymm0, ymm0, ymm9 + vpaddd ymm1, ymm1, ymm9 + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm3 + vpslld ymm2, ymm0, 10 + vpslld ymm3, ymm1, 10 + vpsrld ymm4, ymm0, 12 + vpsrld ymm5, ymm1, 12 + vpsrld ymm6, ymm0, 2 + vpsrld ymm7, ymm1, 2 + vpand ymm0, ymm0, ymm10 + vpand ymm1, ymm1, ymm10 + vpand ymm2, ymm2, ymm11 + vpand ymm3, ymm3, ymm11 + vpand ymm4, ymm4, ymm10 + vpand ymm5, ymm5, ymm10 + vpand ymm6, ymm6, ymm11 + vpand ymm7, ymm7, ymm11 + vpaddw ymm0, ymm0, ymm2 + vpaddw ymm1, ymm1, ymm3 + vpaddw ymm2, ymm4, ymm6 + vpaddw ymm3, ymm5, ymm7 + vpsubw ymm0, ymm0, ymm12 + vpsubw ymm1, ymm1, ymm12 + vpsubw ymm2, ymm2, ymm12 + vpsubw ymm3, ymm3, ymm12 + vpunpckldq ymm4, ymm0, ymm2 + vpunpckldq ymm5, ymm1, ymm3 + vpunpckhdq ymm6, ymm0, ymm2 + vpunpckhdq ymm7, ymm1, ymm3 + vperm2i128 ymm0, ymm4, ymm6, 32 + vperm2i128 ymm1, ymm5, ymm7, 32 + vperm2i128 ymm2, ymm4, ymm6, 49 + vperm2i128 ymm3, ymm5, ymm7, 49 + vmovdqu YMMWORD PTR [rcx+256], ymm0 + vmovdqu YMMWORD PTR [rcx+288], ymm2 + vmovdqu YMMWORD PTR [rcx+320], ymm1 + vmovdqu YMMWORD PTR [rcx+352], ymm3 + vmovdqu ymm0, YMMWORD PTR [rdx+144] + vmovdqu ymm1, YMMWORD PTR [rdx+168] + vpermq ymm0, ymm0, 148 + vpermq ymm1, ymm1, 148 + vpshufb ymm0, ymm0, ymm13 + vpshufb ymm1, ymm1, ymm13 + vpsrld ymm2, ymm0, 1 + vpsrld ymm3, ymm1, 1 + vpsrld ymm4, ymm0, 2 + vpsrld ymm5, ymm1, 2 + vpand ymm0, ymm0, ymm8 + vpand ymm1, ymm1, ymm8 + vpand ymm2, ymm2, ymm8 + vpand ymm3, ymm3, ymm8 + vpand ymm4, ymm4, ymm8 + vpand ymm5, ymm5, ymm8 + vpaddd ymm0, ymm0, ymm2 + vpaddd ymm1, ymm1, ymm3 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpsrld ymm2, ymm0, 3 + vpsrld ymm3, ymm1, 3 + vpaddd ymm0, ymm0, ymm9 + vpaddd ymm1, ymm1, ymm9 + vpsubd ymm0, ymm0, ymm2 + vpsubd ymm1, ymm1, ymm3 + vpslld ymm2, ymm0, 10 + vpslld ymm3, ymm1, 10 + vpsrld ymm4, ymm0, 12 + vpsrld 
ymm5, ymm1, 12 + vpsrld ymm6, ymm0, 2 + vpsrld ymm7, ymm1, 2 + vpand ymm0, ymm0, ymm10 + vpand ymm1, ymm1, ymm10 + vpand ymm2, ymm2, ymm11 + vpand ymm3, ymm3, ymm11 + vpand ymm4, ymm4, ymm10 + vpand ymm5, ymm5, ymm10 + vpand ymm6, ymm6, ymm11 + vpand ymm7, ymm7, ymm11 + vpaddw ymm0, ymm0, ymm2 + vpaddw ymm1, ymm1, ymm3 + vpaddw ymm2, ymm4, ymm6 + vpaddw ymm3, ymm5, ymm7 + vpsubw ymm0, ymm0, ymm12 + vpsubw ymm1, ymm1, ymm12 + vpsubw ymm2, ymm2, ymm12 + vpsubw ymm3, ymm3, ymm12 + vpunpckldq ymm4, ymm0, ymm2 + vpunpckldq ymm5, ymm1, ymm3 + vpunpckhdq ymm6, ymm0, ymm2 + vpunpckhdq ymm7, ymm1, ymm3 + vperm2i128 ymm0, ymm4, ymm6, 32 + vperm2i128 ymm1, ymm5, ymm7, 32 + vperm2i128 ymm2, ymm4, ymm6, 49 + vperm2i128 ymm3, ymm5, ymm7, 49 + vmovdqu YMMWORD PTR [rcx+384], ymm0 + vmovdqu YMMWORD PTR [rcx+416], ymm2 + vmovdqu YMMWORD PTR [rcx+448], ymm1 + vmovdqu YMMWORD PTR [rcx+480], ymm3 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + add rsp, 128 + ret +mlkem_cbd_eta3_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_mask_55 QWORD 5555555555555555h, 5555555555555555h + QWORD 5555555555555555h, 5555555555555555h +ptr_L_mlkem_mask_55 QWORD L_mlkem_mask_55 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_mask_33 QWORD 3333333333333333h, 3333333333333333h + QWORD 3333333333333333h, 3333333333333333h +ptr_L_mlkem_mask_33 QWORD L_mlkem_mask_33 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_mask_03 QWORD 0303030303030303h, 0303030303030303h + QWORD 0303030303030303h, 0303030303030303h +ptr_L_mlkem_mask_03 QWORD L_mlkem_mask_03 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_mask_0f QWORD 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh + QWORD 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh +ptr_L_mlkem_mask_0f QWORD L_mlkem_mask_0f +_DATA ENDS +_TEXT SEGMENT 
READONLY PARA
+mlkem_cbd_eta2_avx2 PROC  ; reads 128 packed bytes at [rdx], writes 512 bytes of 16-bit values at [rcx]; by its name presumably ML-KEM CBD sampling with eta = 2 - confirm against the C caller
+        sub rsp, 96  ; 6 x 16-byte spill slots for xmm6-xmm11 (restored before ret)
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu ymm8, YMMWORD PTR L_mlkem_mask_55  ; constants stay in ymm8-ymm11 for both unrolled passes
+        vmovdqu ymm9, YMMWORD PTR L_mlkem_mask_33
+        vmovdqu ymm10, YMMWORD PTR L_mlkem_mask_03
+        vmovdqu ymm11, YMMWORD PTR L_mlkem_mask_0f
+        vmovdqu ymm0, YMMWORD PTR [rdx]  ; pass 1 of 2: input bytes 0..63
+        vmovdqu ymm1, YMMWORD PTR [rdx+32]
+        vpsrlw ymm2, ymm0, 1  ; x shr 1
+        vpsrlw ymm3, ymm1, 1
+        vpand ymm0, ymm0, ymm8
+        vpand ymm1, ymm1, ymm8
+        vpand ymm2, ymm2, ymm8
+        vpand ymm3, ymm3, ymm8
+        vpaddb ymm0, ymm0, ymm2  ; each 2-bit pair now holds the number of set bits in that pair
+        vpaddb ymm1, ymm1, ymm3
+        vpsrlw ymm2, ymm0, 2  ; upper pair of each nibble moved onto the lower pair
+        vpsrlw ymm3, ymm1, 2
+        vpand ymm0, ymm0, ymm9
+        vpand ymm1, ymm1, ymm9
+        vpand ymm2, ymm2, ymm9
+        vpand ymm3, ymm3, ymm9
+        vpaddb ymm0, ymm0, ymm9  ; add 3 per nibble so the subtract below cannot borrow
+        vpaddb ymm1, ymm1, ymm9
+        vpsubb ymm0, ymm0, ymm2  ; nibble = low-pair count + 3 - high-pair count
+        vpsubb ymm1, ymm1, ymm3
+        vpsrlw ymm2, ymm0, 4  ; high nibbles moved down
+        vpsrlw ymm3, ymm1, 4
+        vpand ymm0, ymm0, ymm11  ; ymm0/ymm1: low nibble per byte
+        vpand ymm1, ymm1, ymm11
+        vpand ymm2, ymm2, ymm11  ; ymm2/ymm3: high nibble per byte
+        vpand ymm3, ymm3, ymm11
+        vpsubb ymm0, ymm0, ymm10  ; remove the +3 bias: bytes become signed differences
+        vpsubb ymm1, ymm1, ymm10
+        vpsubb ymm2, ymm2, ymm10
+        vpsubb ymm3, ymm3, ymm10
+        vpunpcklbw ymm4, ymm0, ymm2  ; interleave low-nibble and high-nibble results byte by byte
+        vpunpcklbw ymm5, ymm1, ymm3
+        vpunpckhbw ymm6, ymm0, ymm2
+        vpunpckhbw ymm7, ymm1, ymm3
+        vpmovsxbw ymm0, xmm4  ; sign-extend 16 bytes to 16 words
+        vpmovsxbw ymm1, xmm5
+        vextracti128 xmm2, ymm4, 1  ; upper lane must be extracted before it can be widened
+        vextracti128 xmm3, ymm5, 1
+        vpmovsxbw ymm2, xmm2
+        vpmovsxbw ymm3, xmm3
+        vpmovsxbw ymm4, xmm6  ; ymm4/ymm5 are free again, reuse them for the low half of ymm6/ymm7
+        vpmovsxbw ymm5, xmm7
+        vextracti128 xmm6, ymm6, 1
+        vextracti128 xmm7, ymm7, 1
+        vpmovsxbw ymm6, xmm6
+        vpmovsxbw ymm7, xmm7
+        vmovdqu YMMWORD PTR [rcx], ymm0  ; pass 1 output: bytes 0..255 of [rcx]
+        vmovdqu YMMWORD PTR [rcx+32], ymm4
+        vmovdqu YMMWORD PTR [rcx+64], ymm2
+        vmovdqu YMMWORD PTR [rcx+96], ymm6
+        vmovdqu YMMWORD PTR [rcx+128], ymm1
+        vmovdqu YMMWORD PTR [rcx+160], ymm5
+        vmovdqu YMMWORD PTR [rcx+192], ymm3
+        vmovdqu YMMWORD PTR [rcx+224], ymm7
+        vmovdqu ymm0, YMMWORD PTR [rdx+64]  ; pass 2 of 2: input bytes 64..127, same sequence as pass 1
+        vmovdqu ymm1, YMMWORD PTR [rdx+96]
+        vpsrlw ymm2, ymm0, 1
+        vpsrlw ymm3, ymm1, 1
+        vpand ymm0, ymm0, ymm8
+        vpand ymm1, ymm1, ymm8
+        vpand ymm2, ymm2, ymm8
+        vpand ymm3, ymm3, ymm8
+        vpaddb ymm0, ymm0, ymm2
+        vpaddb ymm1, ymm1, ymm3
+        vpsrlw ymm2, ymm0, 2
+        vpsrlw ymm3, ymm1, 2
+        vpand ymm0, ymm0, ymm9
+        vpand ymm1, ymm1, ymm9
+        vpand ymm2, ymm2, ymm9
+        vpand ymm3, ymm3, ymm9
+        vpaddb ymm0, ymm0, ymm9
+        vpaddb ymm1, ymm1, ymm9
+        vpsubb ymm0, ymm0, ymm2
+        vpsubb ymm1, ymm1, ymm3
+        vpsrlw ymm2, ymm0, 4
+        vpsrlw ymm3, ymm1, 4
+        vpand ymm0, ymm0, ymm11
+        vpand ymm1, ymm1, ymm11
+        vpand ymm2, ymm2, ymm11
+        vpand ymm3, ymm3, ymm11
+        vpsubb ymm0, ymm0, ymm10
+        vpsubb ymm1, ymm1, ymm10
+        vpsubb ymm2, ymm2, ymm10
+        vpsubb ymm3, ymm3, ymm10
+        vpunpcklbw ymm4, ymm0, ymm2
+        vpunpcklbw ymm5, ymm1, ymm3
+        vpunpckhbw ymm6, ymm0, ymm2
+        vpunpckhbw ymm7, ymm1, ymm3
+        vpmovsxbw ymm0, xmm4
+        vpmovsxbw ymm1, xmm5
+        vextracti128 xmm2, ymm4, 1
+        vextracti128 xmm3, ymm5, 1
+        vpmovsxbw ymm2, xmm2
+        vpmovsxbw ymm3, xmm3
+        vpmovsxbw ymm4, xmm6
+        vpmovsxbw ymm5, xmm7
+        vextracti128 xmm6, ymm6, 1
+        vextracti128 xmm7, ymm7, 1
+        vpmovsxbw ymm6, xmm6
+        vpmovsxbw ymm7, xmm7
+        vmovdqu YMMWORD PTR [rcx+256], ymm0  ; pass 2 output: bytes 256..511
+        vmovdqu YMMWORD PTR [rcx+288], ymm4
+        vmovdqu YMMWORD PTR [rcx+320], ymm2
+        vmovdqu YMMWORD PTR [rcx+352], ymm6
+        vmovdqu YMMWORD PTR [rcx+384], ymm1
+        vmovdqu YMMWORD PTR [rcx+416], ymm5
+        vmovdqu YMMWORD PTR [rcx+448], ymm3
+        vmovdqu YMMWORD PTR [rcx+480], ymm7
+        vzeroupper  ; clear upper YMM halves before returning
+        vmovdqu xmm6, OWORD PTR [rsp]  ; restore the xmm registers spilled in the prologue
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        add rsp, 96
+        ret
+mlkem_cbd_eta2_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_10_avx2_mask WORD 03ffh, 03ffh, 03ffh, 03ffh, 03ffh, 03ffh, 03ffh, 03ffh  ; keeps the low 10 bits of every word
+        WORD 03ffh, 03ffh, 03ffh, 03ffh, 03ffh, 03ffh, 03ffh, 03ffh
+ptr_L_mlkem_compress_10_avx2_mask QWORD L_mlkem_compress_10_avx2_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_10_avx2_shift QWORD 0400000104000001h, 0400000104000001h  ; word pairs (0001h, 0400h): vpmaddwd yields w0 + w1 * 1024
+        QWORD 0400000104000001h, 0400000104000001h
+ptr_L_mlkem_compress_10_avx2_shift QWORD L_mlkem_compress_10_avx2_shift
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_10_avx2_shlv QWORD 000000000000000ch, 000000000000000ch  ; as dwords (12, 0): only even dwords are shifted by vpsllvd
+        QWORD 000000000000000ch, 000000000000000ch
+ptr_L_mlkem_compress_10_avx2_shlv QWORD L_mlkem_compress_10_avx2_shlv
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_10_avx2_shuf BYTE 00h, 01h, 02h, 03h, 04h, 08h, 09h, 0ah  ; low lane: 5 bytes from each qword, packed into bytes 0..9
+        BYTE 0bh, 0ch, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh  ; 0ffh entries zero the destination byte
+        BYTE 09h, 0ah, 0bh, 0ch, 0ffh, 0ffh, 0ffh, 0ffh  ; high lane: its last 4 bytes first ...
+        BYTE 0ffh, 0ffh, 00h, 01h, 02h, 03h, 04h, 08h  ; ... and its first 6 bytes in positions 10..15
+ptr_L_mlkem_compress_10_avx2_shuf QWORD L_mlkem_compress_10_avx2_shuf
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_10_avx2_v WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh  ; multiplier 4ebfh = 20159 in every word
+        WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh
+ptr_L_mlkem_compress_10_avx2_v QWORD L_mlkem_compress_10_avx2_v
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_10_avx2_offset WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh  ; 15 in every word
+        WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh
+ptr_L_mlkem_compress_10_avx2_offset QWORD L_mlkem_compress_10_avx2_offset
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_10_avx2_shift12 WORD 1000h, 1000h, 1000h, 1000h, 1000h, 1000h, 1000h, 1000h  ; vpmulhrsw by 1000h = rounded shift right by 3
+        WORD 1000h, 1000h, 1000h, 1000h, 1000h, 1000h, 1000h, 1000h
+ptr_L_mlkem_compress_10_avx2_shift12 QWORD L_mlkem_compress_10_avx2_shift12
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_compress_10_avx2 PROC  ; per loop pass: reads 512 bytes of 16-bit words at [rdx], writes 320 packed bytes at [rcx]; r8d is the pass count
+        sub rsp, 128  ; 8 x 16-byte spill slots for xmm6-xmm13 (restored before ret)
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu OWORD PTR [rsp+112], xmm13
+        vmovdqu ymm0, YMMWORD PTR [rdx]  ; NOTE(review): reloaded at the loop head, so this load looks redundant
+        vmovdqu ymm9, YMMWORD PTR L_mlkem_compress_10_avx2_mask
+        vmovdqu ymm8, YMMWORD PTR L_mlkem_compress_10_avx2_shift
+        vmovdqu ymm10, YMMWORD PTR L_mlkem_compress_10_avx2_shlv
+        vmovdqu ymm11, YMMWORD PTR L_mlkem_compress_10_avx2_shuf
+        vmovdqu ymm6, YMMWORD PTR L_mlkem_compress_10_avx2_v
+        vmovdqu ymm12, YMMWORD PTR L_mlkem_compress_10_avx2_offset
+        vmovdqu ymm13, YMMWORD PTR L_mlkem_compress_10_avx2_shift12
+        vpsllw ymm7, ymm6, 3  ; ymm7 = 8 * v (low 16 bits), loop-invariant
+L_mlkem_compress_10_avx2_start:
+        vmovdqu ymm0, YMMWORD PTR [rdx]  ; group 1 of 8: 32 input words
+        vmovdqu ymm1, YMMWORD PTR [rdx+32]
+        vpmullw ymm2, ymm0, ymm7  ; low 16 bits of x * 8v
+        vpmullw ymm4, ymm1, ymm7
+        vpaddw ymm3, ymm0, ymm12  ; x + 15
+        vpaddw ymm5, ymm1, ymm12
+        vpsllw ymm0, ymm0, 3  ; 8x
+        vpsllw ymm1, ymm1, 3
+        vpmulhuw ymm0, ymm0, ymm6  ; high 16 bits of 8x * v (unsigned)
+        vpmulhuw ymm1, ymm1, ymm6
+        vpsubw ymm3, ymm2, ymm3
+        vpsubw ymm5, ymm4, ymm5
+        vpandn ymm2, ymm2, ymm3  ; (not low) and (low - (x + 15))
+        vpandn ymm4, ymm4, ymm5
+        vpsrlw ymm2, ymm2, 15  ; keep only the top bit: a 0/1 correction
+        vpsrlw ymm4, ymm4, 15
+        vpsubw ymm0, ymm0, ymm2  ; apply the correction to the high product
+        vpsubw ymm1, ymm1, ymm4
+        vpmulhrsw ymm0, ymm0, ymm13  ; rounded shift right by 3
+        vpmulhrsw ymm1, ymm1, ymm13
+        vpand ymm0, ymm0, ymm9  ; reduce to 10 bits
+        vpand ymm1, ymm1, ymm9
+        vpmaddwd ymm0, ymm0, ymm8  ; two 10-bit values per dword (20 bits)
+        vpmaddwd ymm1, ymm1, ymm8
+        vpsllvd ymm0, ymm0, ymm10  ; even dwords shl 12 ...
+        vpsllvd ymm1, ymm1, ymm10
+        vpsrlq ymm0, ymm0, 12  ; ... then qword shr 12: 40 contiguous bits per qword
+        vpsrlq ymm1, ymm1, 12
+        vpshufb ymm0, ymm0, ymm11  ; gather the 5 useful bytes of each qword
+        vpshufb ymm1, ymm1, ymm11
+        vextracti128 xmm2, ymm0, 1
+        vextracti128 xmm4, ymm1, 1
+        vpblendw xmm0, xmm0, xmm2, 224  ; imm 224: words 5..7 come from the high lane, giving 16 packed bytes
+        vpblendw xmm1, xmm1, xmm4, 224
+        vmovdqu OWORD PTR [rcx], xmm0  ; 16 + 4 output bytes per 16 input words
+        vmovdqu OWORD PTR [rcx+20], xmm1
+        vmovss DWORD PTR [rcx+16], xmm2  ; remaining 4 bytes sit in the low dword of the high lane
+        vmovss DWORD PTR [rcx+36], xmm4
+        vmovdqu ymm0, YMMWORD PTR [rdx+64]  ; group 2 of 8: same sequence as group 1
+        vmovdqu ymm1, YMMWORD PTR [rdx+96]
+        vpmullw ymm2, ymm0, ymm7
+        vpmullw ymm4, ymm1, ymm7
+        vpaddw ymm3, ymm0, ymm12
+        vpaddw ymm5, ymm1, ymm12
+        vpsllw ymm0, ymm0, 3
+        vpsllw ymm1, ymm1, 3
+        vpmulhuw ymm0, ymm0, ymm6
+        vpmulhuw ymm1, ymm1, ymm6
+        vpsubw ymm3, ymm2, ymm3
+        vpsubw ymm5, ymm4, ymm5
+        vpandn ymm2, ymm2, ymm3
+        vpandn ymm4, ymm4, ymm5
+        vpsrlw ymm2, ymm2, 15
+        vpsrlw ymm4, ymm4, 15
+        vpsubw ymm0, ymm0, ymm2
+        vpsubw ymm1, ymm1, ymm4
+        vpmulhrsw ymm0, ymm0, ymm13
+        vpmulhrsw ymm1, ymm1, ymm13
+        vpand ymm0, ymm0, ymm9
+        vpand ymm1, ymm1, ymm9
+        vpmaddwd ymm0, ymm0, ymm8
+        vpmaddwd ymm1, ymm1, ymm8
+        vpsllvd ymm0, ymm0, ymm10
+        vpsllvd ymm1, ymm1, ymm10
+        vpsrlq ymm0, ymm0, 12
+        vpsrlq ymm1, ymm1, 12
+        vpshufb ymm0, ymm0, ymm11
+        vpshufb ymm1, ymm1, ymm11
+        vextracti128 xmm2, ymm0, 1
+        vextracti128 xmm4, ymm1, 1
+        vpblendw xmm0, xmm0, xmm2, 224
+        vpblendw xmm1, xmm1, xmm4, 224
+        vmovdqu OWORD PTR [rcx+40], xmm0
+        vmovdqu OWORD PTR [rcx+60], xmm1
+        vmovss DWORD PTR [rcx+56], xmm2
+        vmovss DWORD PTR [rcx+76], xmm4
+        vmovdqu ymm0, YMMWORD PTR [rdx+128]  ; group 3 of 8
+        vmovdqu ymm1, YMMWORD PTR [rdx+160]
+        vpmullw ymm2, ymm0, ymm7
+        vpmullw ymm4, ymm1, ymm7
+        vpaddw ymm3, ymm0, ymm12
+        vpaddw ymm5, ymm1, ymm12
+        vpsllw ymm0, ymm0, 3
+        vpsllw ymm1, ymm1, 3
+        vpmulhuw ymm0, ymm0, ymm6
+        vpmulhuw ymm1, ymm1, ymm6
+        vpsubw ymm3, ymm2, ymm3
+        vpsubw ymm5, ymm4, ymm5
+        vpandn ymm2, ymm2, ymm3
+        vpandn ymm4, ymm4, ymm5
+        vpsrlw ymm2, ymm2, 15
+        vpsrlw ymm4, ymm4, 15
+        vpsubw ymm0, ymm0, ymm2
+        vpsubw ymm1, ymm1, ymm4
+        vpmulhrsw ymm0, ymm0, ymm13
+        vpmulhrsw ymm1, ymm1, ymm13
+        vpand ymm0, ymm0, ymm9
+        vpand ymm1, ymm1, ymm9
+        vpmaddwd ymm0, ymm0, ymm8
+        vpmaddwd ymm1, ymm1, ymm8
+        vpsllvd ymm0, ymm0, ymm10
+        vpsllvd ymm1, ymm1, ymm10
+        vpsrlq ymm0, ymm0, 12
+        vpsrlq ymm1, ymm1, 12
+        vpshufb ymm0, ymm0, ymm11
+        vpshufb ymm1, ymm1, ymm11
+        vextracti128 xmm2, ymm0, 1
+        vextracti128 xmm4, ymm1, 1
+        vpblendw xmm0, xmm0, xmm2, 224
+        vpblendw xmm1, xmm1, xmm4, 224
+        vmovdqu OWORD PTR [rcx+80], xmm0
+        vmovdqu OWORD PTR [rcx+100], xmm1
+        vmovss DWORD PTR [rcx+96], xmm2
+        vmovss DWORD PTR [rcx+116], xmm4
+        vmovdqu ymm0, YMMWORD PTR [rdx+192]  ; group 4 of 8
+        vmovdqu ymm1, YMMWORD PTR [rdx+224]
+        vpmullw ymm2, ymm0, ymm7
+        vpmullw ymm4, ymm1, ymm7
+        vpaddw ymm3, ymm0, ymm12
+        vpaddw ymm5, ymm1, ymm12
+        vpsllw ymm0, ymm0, 3
+        vpsllw ymm1, ymm1, 3
+        vpmulhuw ymm0, ymm0, ymm6
+        vpmulhuw ymm1, ymm1, ymm6
+        vpsubw ymm3, ymm2, ymm3
+        vpsubw ymm5, ymm4, ymm5
+        vpandn ymm2, ymm2, ymm3
+        vpandn ymm4, ymm4, ymm5
+        vpsrlw ymm2, ymm2, 15
+        vpsrlw ymm4, ymm4, 15
+        vpsubw ymm0, ymm0, ymm2
+        vpsubw ymm1, ymm1, ymm4
+        vpmulhrsw ymm0, ymm0, ymm13
+        vpmulhrsw ymm1, ymm1, ymm13
+        vpand ymm0, ymm0, ymm9
+        vpand ymm1, ymm1, ymm9
+        vpmaddwd ymm0, ymm0, ymm8
+        vpmaddwd ymm1, ymm1, ymm8
+        vpsllvd ymm0, ymm0, ymm10
+        vpsllvd ymm1, ymm1, ymm10
+        vpsrlq ymm0, ymm0, 12
+        vpsrlq ymm1, ymm1, 12
+        vpshufb ymm0, ymm0, ymm11
+        vpshufb ymm1, ymm1, ymm11
+        vextracti128 xmm2, ymm0, 1
+        vextracti128 xmm4, ymm1, 1
+        vpblendw xmm0, xmm0, xmm2, 224
+        vpblendw xmm1, xmm1, xmm4, 224
+        vmovdqu OWORD PTR [rcx+120], xmm0
+        vmovdqu OWORD PTR [rcx+140], xmm1
+        vmovss DWORD PTR [rcx+136], xmm2
+        vmovss DWORD PTR [rcx+156], xmm4
+        vmovdqu ymm0, YMMWORD PTR [rdx+256]  ; group 5 of 8
+        vmovdqu ymm1, YMMWORD PTR [rdx+288]
+        vpmullw ymm2, ymm0, ymm7
+        vpmullw ymm4, ymm1, ymm7
+        vpaddw ymm3, ymm0, ymm12
+        vpaddw ymm5, ymm1, ymm12
+        vpsllw ymm0, ymm0, 3
+        vpsllw ymm1, ymm1, 3
+        vpmulhuw ymm0, ymm0, ymm6
+        vpmulhuw ymm1, ymm1, ymm6
+        vpsubw ymm3, ymm2, ymm3
+        vpsubw ymm5, ymm4, ymm5
+        vpandn ymm2, ymm2, ymm3
+        vpandn ymm4, ymm4, ymm5
+        vpsrlw ymm2, ymm2, 15
+        vpsrlw ymm4, ymm4, 15
+        vpsubw ymm0, ymm0, ymm2
+        vpsubw ymm1, ymm1, ymm4
+        vpmulhrsw ymm0, ymm0, ymm13
+        vpmulhrsw ymm1, ymm1, ymm13
+        vpand ymm0, ymm0, ymm9
+        vpand ymm1, ymm1, ymm9
+        vpmaddwd ymm0, ymm0, ymm8
+        vpmaddwd ymm1, ymm1, ymm8
+        vpsllvd ymm0, ymm0, ymm10
+        vpsllvd ymm1, ymm1, ymm10
+        vpsrlq ymm0, ymm0, 12
+        vpsrlq ymm1, ymm1, 12
+        vpshufb ymm0, ymm0, ymm11
+        vpshufb ymm1, ymm1, ymm11
+        vextracti128 xmm2, ymm0, 1
+        vextracti128 xmm4, ymm1, 1
+        vpblendw xmm0, xmm0, xmm2, 224
+        vpblendw xmm1, xmm1, xmm4, 224
+        vmovdqu OWORD PTR [rcx+160], xmm0
+        vmovdqu OWORD PTR [rcx+180], xmm1
+        vmovss DWORD PTR [rcx+176], xmm2
+        vmovss DWORD PTR [rcx+196], xmm4
+        vmovdqu ymm0, YMMWORD PTR [rdx+320]  ; group 6 of 8
+        vmovdqu ymm1, YMMWORD PTR [rdx+352]
+        vpmullw ymm2, ymm0, ymm7
+        vpmullw ymm4, ymm1, ymm7
+        vpaddw ymm3, ymm0, ymm12
+        vpaddw ymm5, ymm1, ymm12
+        vpsllw ymm0, ymm0, 3
+        vpsllw ymm1, ymm1, 3
+        vpmulhuw ymm0, ymm0, ymm6
+        vpmulhuw ymm1, ymm1, ymm6
+        vpsubw ymm3, ymm2, ymm3
+        vpsubw ymm5, ymm4, ymm5
+        vpandn ymm2, ymm2, ymm3
+        vpandn ymm4, ymm4, ymm5
+        vpsrlw ymm2, ymm2, 15
+        vpsrlw ymm4, ymm4, 15
+        vpsubw ymm0, ymm0, ymm2
+        vpsubw ymm1, ymm1, ymm4
+        vpmulhrsw ymm0, ymm0, ymm13
+        vpmulhrsw ymm1, ymm1, ymm13
+        vpand ymm0, ymm0, ymm9
+        vpand ymm1, ymm1, ymm9
+        vpmaddwd ymm0, ymm0, ymm8
+        vpmaddwd ymm1, ymm1, ymm8
+        vpsllvd ymm0, ymm0, ymm10
+        vpsllvd ymm1, ymm1, ymm10
+        vpsrlq ymm0, ymm0, 12
+        vpsrlq ymm1, ymm1, 12
+        vpshufb ymm0, ymm0, ymm11
+        vpshufb ymm1, ymm1, ymm11
+        vextracti128 xmm2, ymm0, 1
+        vextracti128 xmm4, ymm1, 1
+        vpblendw xmm0, xmm0, xmm2, 224
+        vpblendw xmm1, xmm1, xmm4, 224
+        vmovdqu OWORD PTR [rcx+200], xmm0
+        vmovdqu OWORD PTR [rcx+220], xmm1
+        vmovss DWORD PTR [rcx+216], xmm2
+        vmovss DWORD PTR [rcx+236], xmm4
+        vmovdqu ymm0, YMMWORD PTR [rdx+384]  ; group 7 of 8
+        vmovdqu ymm1, YMMWORD PTR [rdx+416]
+        vpmullw ymm2, ymm0, ymm7
+        vpmullw ymm4, ymm1, ymm7
+        vpaddw ymm3, ymm0, ymm12
+        vpaddw ymm5, ymm1, ymm12
+        vpsllw ymm0, ymm0, 3
+        vpsllw ymm1, ymm1, 3
+        vpmulhuw ymm0, ymm0, ymm6
+        vpmulhuw ymm1, ymm1, ymm6
+        vpsubw ymm3, ymm2, ymm3
+        vpsubw ymm5, ymm4, ymm5
+        vpandn ymm2, ymm2, ymm3
+        vpandn ymm4, ymm4, ymm5
+        vpsrlw ymm2, ymm2, 15
+        vpsrlw ymm4, ymm4, 15
+        vpsubw ymm0, ymm0, ymm2
+        vpsubw ymm1, ymm1, ymm4
+        vpmulhrsw ymm0, ymm0, ymm13
+        vpmulhrsw ymm1, ymm1, ymm13
+        vpand ymm0, ymm0, ymm9
+        vpand ymm1, ymm1, ymm9
+        vpmaddwd ymm0, ymm0, ymm8
+        vpmaddwd ymm1, ymm1, ymm8
+        vpsllvd ymm0, ymm0, ymm10
+        vpsllvd ymm1, ymm1, ymm10
+        vpsrlq ymm0, ymm0, 12
+        vpsrlq ymm1, ymm1, 12
+        vpshufb ymm0, ymm0, ymm11
+        vpshufb ymm1, ymm1, ymm11
+        vextracti128 xmm2, ymm0, 1
+        vextracti128 xmm4, ymm1, 1
+        vpblendw xmm0, xmm0, xmm2, 224
+        vpblendw xmm1, xmm1, xmm4, 224
+        vmovdqu OWORD PTR [rcx+240], xmm0
+        vmovdqu OWORD PTR [rcx+260], xmm1
+        vmovss DWORD PTR [rcx+256], xmm2
+        vmovss DWORD PTR [rcx+276], xmm4
+        vmovdqu ymm0, YMMWORD PTR [rdx+448]  ; group 8 of 8
+        vmovdqu ymm1, YMMWORD PTR [rdx+480]
+        vpmullw ymm2, ymm0, ymm7
+        vpmullw ymm4, ymm1, ymm7
+        vpaddw ymm3, ymm0, ymm12
+        vpaddw ymm5, ymm1, ymm12
+        vpsllw ymm0, ymm0, 3
+        vpsllw ymm1, ymm1, 3
+        vpmulhuw ymm0, ymm0, ymm6
+        vpmulhuw ymm1, ymm1, ymm6
+        vpsubw ymm3, ymm2, ymm3
+        vpsubw ymm5, ymm4, ymm5
+        vpandn ymm2, ymm2, ymm3
+        vpandn ymm4, ymm4, ymm5
+        vpsrlw ymm2, ymm2, 15
+        vpsrlw ymm4, ymm4, 15
+        vpsubw ymm0, ymm0, ymm2
+        vpsubw ymm1, ymm1, ymm4
+        vpmulhrsw ymm0, ymm0, ymm13
+        vpmulhrsw ymm1, ymm1, ymm13
+        vpand ymm0, ymm0, ymm9
+        vpand ymm1, ymm1, ymm9
+        vpmaddwd ymm0, ymm0, ymm8
+        vpmaddwd ymm1, ymm1, ymm8
+        vpsllvd ymm0, ymm0, ymm10
+        vpsllvd ymm1, ymm1, ymm10
+        vpsrlq ymm0, ymm0, 12
+        vpsrlq ymm1, ymm1, 12
+        vpshufb ymm0, ymm0, ymm11
+        vpshufb ymm1, ymm1, ymm11
+        vextracti128 xmm2, ymm0, 1
+        vextracti128 xmm4, ymm1, 1
+        vpblendw xmm0, xmm0, xmm2, 224
+        vpblendw xmm1, xmm1, xmm4, 224
+        vmovdqu OWORD PTR [rcx+280], xmm0
+        vmovdqu OWORD PTR [rcx+300], xmm1
+        vmovss DWORD PTR [rcx+296], xmm2
+        vmovss DWORD PTR [rcx+316], xmm4
+        add rcx, 320  ; advance output by the 320 bytes just written
+        add rdx, 512  ; advance input by the 512 bytes just read
+        sub r8d, 1  ; counter is tested after the body, so one pass always runs
+        jg L_mlkem_compress_10_avx2_start
+        vzeroupper  ; clear upper YMM halves before returning
+        vmovdqu xmm6, OWORD PTR [rsp]  ; restore the xmm registers spilled in the prologue
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        vmovdqu xmm13, OWORD PTR [rsp+112]
+        add rsp, 128
+        ret
+mlkem_compress_10_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_10_avx2_mask DWORD 7fe01ff8h, 7fe01ff8h, 7fe01ff8h, 7fe01ff8h  ; per dword: bits 3..12 of the low word, bits 5..14 of the high word
+        DWORD 7fe01ff8h, 7fe01ff8h, 7fe01ff8h, 7fe01ff8h
+ptr_L_mlkem_decompress_10_avx2_mask QWORD L_mlkem_decompress_10_avx2_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_10_avx2_sllv QWORD 0000000000000004h, 0000000000000004h  ; as dwords (4, 0): only even dwords are shifted by vpsllvd
+        QWORD 0000000000000004h, 0000000000000004h
+ptr_L_mlkem_decompress_10_avx2_sllv QWORD L_mlkem_decompress_10_avx2_sllv
+_DATA
ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_10_avx2_q DWORD 0d013404h, 0d013404h, 0d013404h, 0d013404h  ; word pairs (3404h, 0d01h) = (13316, 3329), the vpmulhrsw multipliers
+        DWORD 0d013404h, 0d013404h, 0d013404h, 0d013404h
+ptr_L_mlkem_decompress_10_avx2_q QWORD L_mlkem_decompress_10_avx2_q
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_10_avx2_shuf BYTE 00h, 01h, 01h, 02h, 02h, 03h, 03h, 04h  ; each output word takes the 2 source bytes that span one 10-bit field
+        BYTE 05h, 06h, 06h, 07h, 07h, 08h, 08h, 09h
+        BYTE 02h, 03h, 03h, 04h, 04h, 05h, 05h, 06h  ; high lane: indices into its own 16 bytes
+        BYTE 07h, 08h, 08h, 09h, 09h, 0ah, 0ah, 0bh
+ptr_L_mlkem_decompress_10_avx2_shuf QWORD L_mlkem_decompress_10_avx2_shuf
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_decompress_10_avx2 PROC  ; per loop pass: reads 320 packed bytes at [rdx], writes 512 bytes of 16-bit words at [rcx]; r8d is the pass count
+        sub rsp, 32  ; 2 x 16-byte spill slots for xmm6 and xmm7 (restored before ret)
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu ymm4, YMMWORD PTR L_mlkem_decompress_10_avx2_mask  ; constants stay in ymm4-ymm7 for the whole loop
+        vmovdqu ymm5, YMMWORD PTR L_mlkem_decompress_10_avx2_q
+        vmovdqu ymm6, YMMWORD PTR L_mlkem_decompress_10_avx2_shuf
+        vmovdqu ymm7, YMMWORD PTR L_mlkem_decompress_10_avx2_sllv
+L_mlkem_decompress_10_avx2_start:
+        vpermq ymm0, [rdx], 148  ; group 1 of 4; imm 148 = qwords 0,1,1,2 so the high lane gets source bytes 8..23
+        vpermq ymm1, [rdx+20], 148  ; inputs are consumed 20 bytes at a time (16 values of 10 bits)
+        vpermq ymm2, [rdx+40], 148
+        vpermq ymm3, [rdx+60], 148
+        vpshufb ymm0, ymm0, ymm6  ; place the 2 bytes holding each 10-bit field into its own word
+        vpshufb ymm1, ymm1, ymm6
+        vpshufb ymm2, ymm2, ymm6
+        vpshufb ymm3, ymm3, ymm6
+        vpsllvd ymm0, ymm0, ymm7  ; even dwords shl 4, odd dwords unchanged
+        vpsllvd ymm1, ymm1, ymm7
+        vpsllvd ymm2, ymm2, ymm7
+        vpsllvd ymm3, ymm3, ymm7
+        vpsrlw ymm0, ymm0, 1
+        vpsrlw ymm1, ymm1, 1
+        vpsrlw ymm2, ymm2, 1
+        vpsrlw ymm3, ymm3, 1
+        vpand ymm0, ymm0, ymm4  ; isolate one 10-bit field per word
+        vpand ymm1, ymm1, ymm4
+        vpand ymm2, ymm2, ymm4
+        vpand ymm3, ymm3, ymm4
+        vpmulhrsw ymm0, ymm0, ymm5  ; rounded high multiply by the per-word constants
+        vpmulhrsw ymm1, ymm1, ymm5
+        vpmulhrsw ymm2, ymm2, ymm5
+        vpmulhrsw ymm3, ymm3, ymm5
+        vmovdqu YMMWORD PTR [rcx], ymm0  ; group 1 output: bytes 0..127 of [rcx]
+        vmovdqu YMMWORD PTR [rcx+32], ymm1
+        vmovdqu YMMWORD PTR [rcx+64], ymm2
+        vmovdqu YMMWORD PTR [rcx+96], ymm3
+        vpermq ymm0, [rdx+80], 148  ; group 2 of 4: same sequence as group 1
+        vpermq ymm1, [rdx+100], 148
+        vpermq ymm2, [rdx+120], 148
+        vpermq ymm3, [rdx+140], 148
+        vpshufb ymm0, ymm0, ymm6
+        vpshufb ymm1, ymm1, ymm6
+        vpshufb ymm2, ymm2, ymm6
+        vpshufb ymm3, ymm3, ymm6
+        vpsllvd ymm0, ymm0, ymm7
+        vpsllvd ymm1, ymm1, ymm7
+        vpsllvd ymm2, ymm2, ymm7
+        vpsllvd ymm3, ymm3, ymm7
+        vpsrlw ymm0, ymm0, 1
+        vpsrlw ymm1, ymm1, 1
+        vpsrlw ymm2, ymm2, 1
+        vpsrlw ymm3, ymm3, 1
+        vpand ymm0, ymm0, ymm4
+        vpand ymm1, ymm1, ymm4
+        vpand ymm2, ymm2, ymm4
+        vpand ymm3, ymm3, ymm4
+        vpmulhrsw ymm0, ymm0, ymm5
+        vpmulhrsw ymm1, ymm1, ymm5
+        vpmulhrsw ymm2, ymm2, ymm5
+        vpmulhrsw ymm3, ymm3, ymm5
+        vmovdqu YMMWORD PTR [rcx+128], ymm0  ; group 2 output: bytes 128..255
+        vmovdqu YMMWORD PTR [rcx+160], ymm1
+        vmovdqu YMMWORD PTR [rcx+192], ymm2
+        vmovdqu YMMWORD PTR [rcx+224], ymm3
+        vpermq ymm0, [rdx+160], 148  ; group 3 of 4
+        vpermq ymm1, [rdx+180], 148
+        vpermq ymm2, [rdx+200], 148
+        vpermq ymm3, [rdx+220], 148
+        vpshufb ymm0, ymm0, ymm6
+        vpshufb ymm1, ymm1, ymm6
+        vpshufb ymm2, ymm2, ymm6
+        vpshufb ymm3, ymm3, ymm6
+        vpsllvd ymm0, ymm0, ymm7
+        vpsllvd ymm1, ymm1, ymm7
+        vpsllvd ymm2, ymm2, ymm7
+        vpsllvd ymm3, ymm3, ymm7
+        vpsrlw ymm0, ymm0, 1
+        vpsrlw ymm1, ymm1, 1
+        vpsrlw ymm2, ymm2, 1
+        vpsrlw ymm3, ymm3, 1
+        vpand ymm0, ymm0, ymm4
+        vpand ymm1, ymm1, ymm4
+        vpand ymm2, ymm2, ymm4
+        vpand ymm3, ymm3, ymm4
+        vpmulhrsw ymm0, ymm0, ymm5
+        vpmulhrsw ymm1, ymm1, ymm5
+        vpmulhrsw ymm2, ymm2, ymm5
+        vpmulhrsw ymm3, ymm3, ymm5
+        vmovdqu YMMWORD PTR [rcx+256], ymm0  ; group 3 output: bytes 256..383
+        vmovdqu YMMWORD PTR [rcx+288], ymm1
+        vmovdqu YMMWORD PTR [rcx+320], ymm2
+        vmovdqu YMMWORD PTR [rcx+352], ymm3
+        vpermq ymm0, [rdx+240], 148  ; group 4 of 4
+        vpermq ymm1, [rdx+260], 148
+        vpermq ymm2, [rdx+280], 148
+        vpermq ymm3, [rdx+300], 148  ; NOTE(review): 32-byte load touches bytes up to rdx+331, 12 past the 320 consumed per pass - confirm buffer slack
+        vpshufb ymm0, ymm0, ymm6
+        vpshufb ymm1, ymm1, ymm6
+        vpshufb ymm2, ymm2, ymm6
+        vpshufb ymm3, ymm3, ymm6
+        vpsllvd ymm0, ymm0, ymm7
+        vpsllvd ymm1, ymm1, ymm7
+        vpsllvd ymm2, ymm2, ymm7
+        vpsllvd ymm3, ymm3, ymm7
+        vpsrlw ymm0, ymm0, 1
+        vpsrlw ymm1, ymm1, 1
+        vpsrlw ymm2, ymm2, 1
+        vpsrlw ymm3, ymm3, 1
+        vpand ymm0, ymm0, ymm4
+        vpand ymm1, ymm1, ymm4
+        vpand ymm2, ymm2, ymm4
+        vpand ymm3, ymm3, ymm4
+        vpmulhrsw ymm0, ymm0, ymm5
+        vpmulhrsw ymm1, ymm1, ymm5
+        vpmulhrsw ymm2, ymm2, ymm5
+        vpmulhrsw ymm3, ymm3, ymm5
+        vmovdqu YMMWORD PTR [rcx+384], ymm0  ; group 4 output: bytes 384..511
+        vmovdqu YMMWORD PTR [rcx+416], ymm1
+        vmovdqu YMMWORD PTR [rcx+448], ymm2
+        vmovdqu YMMWORD PTR [rcx+480], ymm3
+        add rdx, 320  ; advance input by the 320 bytes just consumed
+        add rcx, 512  ; advance output by the 512 bytes just written
+        sub r8d, 1  ; counter is tested after the body, so one pass always runs
+        jg L_mlkem_decompress_10_avx2_start
+        vzeroupper  ; clear upper YMM halves before returning
+        vmovdqu xmm6, OWORD PTR [rsp]  ; restore the xmm registers spilled in the prologue
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        add rsp, 32
+        ret
+mlkem_decompress_10_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_v WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh  ; multiplier 4ebfh = 20159 in every word
+        WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh
+ptr_L_mlkem_compress_11_avx2_v QWORD L_mlkem_compress_11_avx2_v
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_off WORD 0024h, 0024h, 0024h, 0024h, 0024h, 0024h, 0024h, 0024h  ; 36 in every word
+        WORD 0024h, 0024h, 0024h, 0024h, 0024h, 0024h, 0024h, 0024h
+ptr_L_mlkem_compress_11_avx2_off QWORD L_mlkem_compress_11_avx2_off
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_shift13 WORD 2000h, 2000h, 2000h, 2000h, 2000h, 2000h, 2000h, 2000h  ; vpmulhrsw by 2000h = rounded shift right by 2
+        WORD 2000h, 2000h, 2000h, 2000h, 2000h, 2000h, 2000h, 2000h
+ptr_L_mlkem_compress_11_avx2_shift13 QWORD L_mlkem_compress_11_avx2_shift13
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_mask WORD 07ffh, 07ffh, 07ffh, 07ffh, 07ffh, 07ffh, 07ffh, 07ffh  ; keeps the low 11 bits of every word
+        WORD 07ffh, 07ffh, 07ffh, 07ffh, 07ffh, 07ffh, 07ffh, 07ffh
+ptr_L_mlkem_compress_11_avx2_mask QWORD L_mlkem_compress_11_avx2_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_shift QWORD 0800000108000001h, 0800000108000001h  ; word pairs (0001h, 0800h): vpmaddwd yields w0 + w1 * 2048
+        QWORD 0800000108000001h, 0800000108000001h
+ptr_L_mlkem_compress_11_avx2_shift QWORD L_mlkem_compress_11_avx2_shift
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_sllvd DWORD 0000000ah, 00000000h, 0000000ah, 00000000h  ; per-dword shift counts (10, 0)
+        DWORD 0000000ah, 00000000h, 0000000ah, 00000000h
+ptr_L_mlkem_compress_11_avx2_sllvd QWORD L_mlkem_compress_11_avx2_sllvd
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_11_avx2_srlvq QWORD 000000000000000ah, 000000000000001eh
+        QWORD
000000000000000ah, 000000000000001eh +ptr_L_mlkem_compress_11_avx2_srlvq QWORD L_mlkem_compress_11_avx2_srlvq +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_compress_11_avx2_shuf BYTE 00h, 01h, 02h, 03h, 04h, 05h, 06h, 07h + BYTE 08h, 09h, 0ah, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 05h, 06h, 07h, 08h, 09h, 0ah, 0ffh, 0ffh + BYTE 0ffh, 0ffh, 00h, 00h, 01h, 02h, 03h, 04h +ptr_L_mlkem_compress_11_avx2_shuf QWORD L_mlkem_compress_11_avx2_shuf +_DATA ENDS +_TEXT SEGMENT READONLY PARA +mlkem_compress_11_avx2 PROC + sub rsp, 144 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm7, YMMWORD PTR L_mlkem_compress_11_avx2_v + vmovdqu ymm8, YMMWORD PTR L_mlkem_compress_11_avx2_off + vmovdqu ymm9, YMMWORD PTR L_mlkem_compress_11_avx2_shift13 + vmovdqu ymm10, YMMWORD PTR L_mlkem_compress_11_avx2_mask + vmovdqu ymm11, YMMWORD PTR L_mlkem_compress_11_avx2_shift + vmovdqu ymm12, YMMWORD PTR L_mlkem_compress_11_avx2_sllvd + vmovdqu ymm13, YMMWORD PTR L_mlkem_compress_11_avx2_srlvq + vmovdqu ymm14, YMMWORD PTR L_mlkem_compress_11_avx2_shuf + vpsllw ymm6, ymm7, 3 +L_mlkem_compress_11_avx2_start: + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm3, YMMWORD PTR [rdx+32] + vpmullw ymm1, ymm0, ymm6 + vpmullw ymm4, ymm3, ymm6 + vpaddw ymm2, ymm0, ymm8 + vpaddw ymm5, ymm3, ymm8 + vpsllw ymm0, ymm0, 3 + vpsllw ymm3, ymm3, 3 + vpmulhw ymm0, ymm0, ymm7 + vpmulhw ymm3, ymm3, ymm7 + vpsubw ymm2, ymm1, ymm2 + vpsubw ymm5, ymm4, ymm5 + vpandn ymm1, ymm1, ymm2 + vpandn ymm4, ymm4, ymm5 + vpsrlw ymm1, ymm1, 15 + vpsrlw ymm4, ymm4, 15 + vpsubw ymm0, ymm0, ymm1 + vpsubw ymm3, ymm3, ymm4 + vpmulhrsw ymm0, ymm0, ymm9 + vpmulhrsw ymm3, ymm3, ymm9 + vpand ymm0, ymm0, ymm10 + vpand ymm3, ymm3, ymm10 + 
vpmaddwd ymm0, ymm0, ymm11 + vpmaddwd ymm3, ymm3, ymm11 + vpsllvd ymm0, ymm0, ymm12 + vpsllvd ymm3, ymm3, ymm12 + vpsrldq ymm1, ymm0, 8 + vpsrldq ymm4, ymm3, 8 + vpsrlvq ymm0, ymm0, ymm13 + vpsrlvq ymm3, ymm3, ymm13 + vpsllq ymm1, ymm1, 34 + vpsllq ymm4, ymm4, 34 + vpaddq ymm0, ymm0, ymm1 + vpaddq ymm3, ymm3, ymm4 + vpshufb ymm0, ymm0, ymm14 + vpshufb ymm3, ymm3, ymm14 + vextracti128 xmm1, ymm0, 1 + vextracti128 xmm4, ymm3, 1 + vpblendvb xmm0, xmm0, xmm1, xmm14 + vpblendvb xmm3, xmm3, xmm4, xmm14 + vmovdqu OWORD PTR [rcx], xmm0 + vmovq QWORD PTR [rcx+16], xmm1 + vmovdqu OWORD PTR [rcx+22], xmm3 + vmovq QWORD PTR [rcx+38], xmm4 + vmovdqu ymm0, YMMWORD PTR [rdx+64] + vmovdqu ymm3, YMMWORD PTR [rdx+96] + vpmullw ymm1, ymm0, ymm6 + vpmullw ymm4, ymm3, ymm6 + vpaddw ymm2, ymm0, ymm8 + vpaddw ymm5, ymm3, ymm8 + vpsllw ymm0, ymm0, 3 + vpsllw ymm3, ymm3, 3 + vpmulhw ymm0, ymm0, ymm7 + vpmulhw ymm3, ymm3, ymm7 + vpsubw ymm2, ymm1, ymm2 + vpsubw ymm5, ymm4, ymm5 + vpandn ymm1, ymm1, ymm2 + vpandn ymm4, ymm4, ymm5 + vpsrlw ymm1, ymm1, 15 + vpsrlw ymm4, ymm4, 15 + vpsubw ymm0, ymm0, ymm1 + vpsubw ymm3, ymm3, ymm4 + vpmulhrsw ymm0, ymm0, ymm9 + vpmulhrsw ymm3, ymm3, ymm9 + vpand ymm0, ymm0, ymm10 + vpand ymm3, ymm3, ymm10 + vpmaddwd ymm0, ymm0, ymm11 + vpmaddwd ymm3, ymm3, ymm11 + vpsllvd ymm0, ymm0, ymm12 + vpsllvd ymm3, ymm3, ymm12 + vpsrldq ymm1, ymm0, 8 + vpsrldq ymm4, ymm3, 8 + vpsrlvq ymm0, ymm0, ymm13 + vpsrlvq ymm3, ymm3, ymm13 + vpsllq ymm1, ymm1, 34 + vpsllq ymm4, ymm4, 34 + vpaddq ymm0, ymm0, ymm1 + vpaddq ymm3, ymm3, ymm4 + vpshufb ymm0, ymm0, ymm14 + vpshufb ymm3, ymm3, ymm14 + vextracti128 xmm1, ymm0, 1 + vextracti128 xmm4, ymm3, 1 + vpblendvb xmm0, xmm0, xmm1, xmm14 + vpblendvb xmm3, xmm3, xmm4, xmm14 + vmovdqu OWORD PTR [rcx+44], xmm0 + vmovq QWORD PTR [rcx+60], xmm1 + vmovdqu OWORD PTR [rcx+66], xmm3 + vmovq QWORD PTR [rcx+82], xmm4 + vmovdqu ymm0, YMMWORD PTR [rdx+128] + vmovdqu ymm3, YMMWORD PTR [rdx+160] + vpmullw ymm1, ymm0, ymm6 + vpmullw ymm4, ymm3, ymm6 
+ vpaddw ymm2, ymm0, ymm8 + vpaddw ymm5, ymm3, ymm8 + vpsllw ymm0, ymm0, 3 + vpsllw ymm3, ymm3, 3 + vpmulhw ymm0, ymm0, ymm7 + vpmulhw ymm3, ymm3, ymm7 + vpsubw ymm2, ymm1, ymm2 + vpsubw ymm5, ymm4, ymm5 + vpandn ymm1, ymm1, ymm2 + vpandn ymm4, ymm4, ymm5 + vpsrlw ymm1, ymm1, 15 + vpsrlw ymm4, ymm4, 15 + vpsubw ymm0, ymm0, ymm1 + vpsubw ymm3, ymm3, ymm4 + vpmulhrsw ymm0, ymm0, ymm9 + vpmulhrsw ymm3, ymm3, ymm9 + vpand ymm0, ymm0, ymm10 + vpand ymm3, ymm3, ymm10 + vpmaddwd ymm0, ymm0, ymm11 + vpmaddwd ymm3, ymm3, ymm11 + vpsllvd ymm0, ymm0, ymm12 + vpsllvd ymm3, ymm3, ymm12 + vpsrldq ymm1, ymm0, 8 + vpsrldq ymm4, ymm3, 8 + vpsrlvq ymm0, ymm0, ymm13 + vpsrlvq ymm3, ymm3, ymm13 + vpsllq ymm1, ymm1, 34 + vpsllq ymm4, ymm4, 34 + vpaddq ymm0, ymm0, ymm1 + vpaddq ymm3, ymm3, ymm4 + vpshufb ymm0, ymm0, ymm14 + vpshufb ymm3, ymm3, ymm14 + vextracti128 xmm1, ymm0, 1 + vextracti128 xmm4, ymm3, 1 + vpblendvb xmm0, xmm0, xmm1, xmm14 + vpblendvb xmm3, xmm3, xmm4, xmm14 + vmovdqu OWORD PTR [rcx+88], xmm0 + vmovq QWORD PTR [rcx+104], xmm1 + vmovdqu OWORD PTR [rcx+110], xmm3 + vmovq QWORD PTR [rcx+126], xmm4 + vmovdqu ymm0, YMMWORD PTR [rdx+192] + vmovdqu ymm3, YMMWORD PTR [rdx+224] + vpmullw ymm1, ymm0, ymm6 + vpmullw ymm4, ymm3, ymm6 + vpaddw ymm2, ymm0, ymm8 + vpaddw ymm5, ymm3, ymm8 + vpsllw ymm0, ymm0, 3 + vpsllw ymm3, ymm3, 3 + vpmulhw ymm0, ymm0, ymm7 + vpmulhw ymm3, ymm3, ymm7 + vpsubw ymm2, ymm1, ymm2 + vpsubw ymm5, ymm4, ymm5 + vpandn ymm1, ymm1, ymm2 + vpandn ymm4, ymm4, ymm5 + vpsrlw ymm1, ymm1, 15 + vpsrlw ymm4, ymm4, 15 + vpsubw ymm0, ymm0, ymm1 + vpsubw ymm3, ymm3, ymm4 + vpmulhrsw ymm0, ymm0, ymm9 + vpmulhrsw ymm3, ymm3, ymm9 + vpand ymm0, ymm0, ymm10 + vpand ymm3, ymm3, ymm10 + vpmaddwd ymm0, ymm0, ymm11 + vpmaddwd ymm3, ymm3, ymm11 + vpsllvd ymm0, ymm0, ymm12 + vpsllvd ymm3, ymm3, ymm12 + vpsrldq ymm1, ymm0, 8 + vpsrldq ymm4, ymm3, 8 + vpsrlvq ymm0, ymm0, ymm13 + vpsrlvq ymm3, ymm3, ymm13 + vpsllq ymm1, ymm1, 34 + vpsllq ymm4, ymm4, 34 + vpaddq ymm0, ymm0, ymm1 + 
vpaddq ymm3, ymm3, ymm4 + vpshufb ymm0, ymm0, ymm14 + vpshufb ymm3, ymm3, ymm14 + vextracti128 xmm1, ymm0, 1 + vextracti128 xmm4, ymm3, 1 + vpblendvb xmm0, xmm0, xmm1, xmm14 + vpblendvb xmm3, xmm3, xmm4, xmm14 + vmovdqu OWORD PTR [rcx+132], xmm0 + vmovq QWORD PTR [rcx+148], xmm1 + vmovdqu OWORD PTR [rcx+154], xmm3 + vmovq QWORD PTR [rcx+170], xmm4 + vmovdqu ymm0, YMMWORD PTR [rdx+256] + vmovdqu ymm3, YMMWORD PTR [rdx+288] + vpmullw ymm1, ymm0, ymm6 + vpmullw ymm4, ymm3, ymm6 + vpaddw ymm2, ymm0, ymm8 + vpaddw ymm5, ymm3, ymm8 + vpsllw ymm0, ymm0, 3 + vpsllw ymm3, ymm3, 3 + vpmulhw ymm0, ymm0, ymm7 + vpmulhw ymm3, ymm3, ymm7 + vpsubw ymm2, ymm1, ymm2 + vpsubw ymm5, ymm4, ymm5 + vpandn ymm1, ymm1, ymm2 + vpandn ymm4, ymm4, ymm5 + vpsrlw ymm1, ymm1, 15 + vpsrlw ymm4, ymm4, 15 + vpsubw ymm0, ymm0, ymm1 + vpsubw ymm3, ymm3, ymm4 + vpmulhrsw ymm0, ymm0, ymm9 + vpmulhrsw ymm3, ymm3, ymm9 + vpand ymm0, ymm0, ymm10 + vpand ymm3, ymm3, ymm10 + vpmaddwd ymm0, ymm0, ymm11 + vpmaddwd ymm3, ymm3, ymm11 + vpsllvd ymm0, ymm0, ymm12 + vpsllvd ymm3, ymm3, ymm12 + vpsrldq ymm1, ymm0, 8 + vpsrldq ymm4, ymm3, 8 + vpsrlvq ymm0, ymm0, ymm13 + vpsrlvq ymm3, ymm3, ymm13 + vpsllq ymm1, ymm1, 34 + vpsllq ymm4, ymm4, 34 + vpaddq ymm0, ymm0, ymm1 + vpaddq ymm3, ymm3, ymm4 + vpshufb ymm0, ymm0, ymm14 + vpshufb ymm3, ymm3, ymm14 + vextracti128 xmm1, ymm0, 1 + vextracti128 xmm4, ymm3, 1 + vpblendvb xmm0, xmm0, xmm1, xmm14 + vpblendvb xmm3, xmm3, xmm4, xmm14 + vmovdqu OWORD PTR [rcx+176], xmm0 + vmovq QWORD PTR [rcx+192], xmm1 + vmovdqu OWORD PTR [rcx+198], xmm3 + vmovq QWORD PTR [rcx+214], xmm4 + vmovdqu ymm0, YMMWORD PTR [rdx+320] + vmovdqu ymm3, YMMWORD PTR [rdx+352] + vpmullw ymm1, ymm0, ymm6 + vpmullw ymm4, ymm3, ymm6 + vpaddw ymm2, ymm0, ymm8 + vpaddw ymm5, ymm3, ymm8 + vpsllw ymm0, ymm0, 3 + vpsllw ymm3, ymm3, 3 + vpmulhw ymm0, ymm0, ymm7 + vpmulhw ymm3, ymm3, ymm7 + vpsubw ymm2, ymm1, ymm2 + vpsubw ymm5, ymm4, ymm5 + vpandn ymm1, ymm1, ymm2 + vpandn ymm4, ymm4, ymm5 + vpsrlw ymm1, ymm1, 15 
+ vpsrlw ymm4, ymm4, 15 + vpsubw ymm0, ymm0, ymm1 + vpsubw ymm3, ymm3, ymm4 + vpmulhrsw ymm0, ymm0, ymm9 + vpmulhrsw ymm3, ymm3, ymm9 + vpand ymm0, ymm0, ymm10 + vpand ymm3, ymm3, ymm10 + vpmaddwd ymm0, ymm0, ymm11 + vpmaddwd ymm3, ymm3, ymm11 + vpsllvd ymm0, ymm0, ymm12 + vpsllvd ymm3, ymm3, ymm12 + vpsrldq ymm1, ymm0, 8 + vpsrldq ymm4, ymm3, 8 + vpsrlvq ymm0, ymm0, ymm13 + vpsrlvq ymm3, ymm3, ymm13 + vpsllq ymm1, ymm1, 34 + vpsllq ymm4, ymm4, 34 + vpaddq ymm0, ymm0, ymm1 + vpaddq ymm3, ymm3, ymm4 + vpshufb ymm0, ymm0, ymm14 + vpshufb ymm3, ymm3, ymm14 + vextracti128 xmm1, ymm0, 1 + vextracti128 xmm4, ymm3, 1 + vpblendvb xmm0, xmm0, xmm1, xmm14 + vpblendvb xmm3, xmm3, xmm4, xmm14 + vmovdqu OWORD PTR [rcx+220], xmm0 + vmovq QWORD PTR [rcx+236], xmm1 + vmovdqu OWORD PTR [rcx+242], xmm3 + vmovq QWORD PTR [rcx+258], xmm4 + vmovdqu ymm0, YMMWORD PTR [rdx+384] + vmovdqu ymm3, YMMWORD PTR [rdx+416] + vpmullw ymm1, ymm0, ymm6 + vpmullw ymm4, ymm3, ymm6 + vpaddw ymm2, ymm0, ymm8 + vpaddw ymm5, ymm3, ymm8 + vpsllw ymm0, ymm0, 3 + vpsllw ymm3, ymm3, 3 + vpmulhw ymm0, ymm0, ymm7 + vpmulhw ymm3, ymm3, ymm7 + vpsubw ymm2, ymm1, ymm2 + vpsubw ymm5, ymm4, ymm5 + vpandn ymm1, ymm1, ymm2 + vpandn ymm4, ymm4, ymm5 + vpsrlw ymm1, ymm1, 15 + vpsrlw ymm4, ymm4, 15 + vpsubw ymm0, ymm0, ymm1 + vpsubw ymm3, ymm3, ymm4 + vpmulhrsw ymm0, ymm0, ymm9 + vpmulhrsw ymm3, ymm3, ymm9 + vpand ymm0, ymm0, ymm10 + vpand ymm3, ymm3, ymm10 + vpmaddwd ymm0, ymm0, ymm11 + vpmaddwd ymm3, ymm3, ymm11 + vpsllvd ymm0, ymm0, ymm12 + vpsllvd ymm3, ymm3, ymm12 + vpsrldq ymm1, ymm0, 8 + vpsrldq ymm4, ymm3, 8 + vpsrlvq ymm0, ymm0, ymm13 + vpsrlvq ymm3, ymm3, ymm13 + vpsllq ymm1, ymm1, 34 + vpsllq ymm4, ymm4, 34 + vpaddq ymm0, ymm0, ymm1 + vpaddq ymm3, ymm3, ymm4 + vpshufb ymm0, ymm0, ymm14 + vpshufb ymm3, ymm3, ymm14 + vextracti128 xmm1, ymm0, 1 + vextracti128 xmm4, ymm3, 1 + vpblendvb xmm0, xmm0, xmm1, xmm14 + vpblendvb xmm3, xmm3, xmm4, xmm14 + vmovdqu OWORD PTR [rcx+264], xmm0 + vmovq QWORD PTR [rcx+280], xmm1 
+ vmovdqu OWORD PTR [rcx+286], xmm3 + vmovq QWORD PTR [rcx+302], xmm4 + vmovdqu ymm0, YMMWORD PTR [rdx+448] + vmovdqu ymm3, YMMWORD PTR [rdx+480] + vpmullw ymm1, ymm0, ymm6 + vpmullw ymm4, ymm3, ymm6 + vpaddw ymm2, ymm0, ymm8 + vpaddw ymm5, ymm3, ymm8 + vpsllw ymm0, ymm0, 3 + vpsllw ymm3, ymm3, 3 + vpmulhw ymm0, ymm0, ymm7 + vpmulhw ymm3, ymm3, ymm7 + vpsubw ymm2, ymm1, ymm2 + vpsubw ymm5, ymm4, ymm5 + vpandn ymm1, ymm1, ymm2 + vpandn ymm4, ymm4, ymm5 + vpsrlw ymm1, ymm1, 15 + vpsrlw ymm4, ymm4, 15 + vpsubw ymm0, ymm0, ymm1 + vpsubw ymm3, ymm3, ymm4 + vpmulhrsw ymm0, ymm0, ymm9 + vpmulhrsw ymm3, ymm3, ymm9 + vpand ymm0, ymm0, ymm10 + vpand ymm3, ymm3, ymm10 + vpmaddwd ymm0, ymm0, ymm11 + vpmaddwd ymm3, ymm3, ymm11 + vpsllvd ymm0, ymm0, ymm12 + vpsllvd ymm3, ymm3, ymm12 + vpsrldq ymm1, ymm0, 8 + vpsrldq ymm4, ymm3, 8 + vpsrlvq ymm0, ymm0, ymm13 + vpsrlvq ymm3, ymm3, ymm13 + vpsllq ymm1, ymm1, 34 + vpsllq ymm4, ymm4, 34 + vpaddq ymm0, ymm0, ymm1 + vpaddq ymm3, ymm3, ymm4 + vpshufb ymm0, ymm0, ymm14 + vpshufb ymm3, ymm3, ymm14 + vextracti128 xmm1, ymm0, 1 + vextracti128 xmm4, ymm3, 1 + vpblendvb xmm0, xmm0, xmm1, xmm14 + vpblendvb xmm3, xmm3, xmm4, xmm14 + vmovdqu OWORD PTR [rcx+308], xmm0 + vmovq QWORD PTR [rcx+324], xmm1 + vmovdqu OWORD PTR [rcx+330], xmm3 + vmovq QWORD PTR [rcx+346], xmm4 + add rcx, 352 + add rdx, 512 + sub r8d, 1 + jg L_mlkem_compress_11_avx2_start + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + add rsp, 144 + ret +mlkem_compress_11_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_decompress_11_avx2_q WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h + WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h +ptr_L_mlkem_decompress_11_avx2_q QWORD 
L_mlkem_decompress_11_avx2_q
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_11_avx2_shuf BYTE 00h, 01h, 01h, 02h, 02h, 03h, 04h, 05h  ; vpshufb byte selectors (loaded into ymm5)
+        BYTE 05h, 06h, 06h, 07h, 08h, 09h, 09h, 0ah
+        BYTE 03h, 04h, 04h, 05h, 05h, 06h, 07h, 08h
+        BYTE 08h, 09h, 09h, 0ah, 0bh, 0ch, 0ch, 0dh
+ptr_L_mlkem_decompress_11_avx2_shuf QWORD L_mlkem_decompress_11_avx2_shuf
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_11_avx2_sllv DWORD 00000000h, 00000001h, 00000000h, 00000000h  ; per-dword shift counts (ymm6); despite the name they feed vpsrlvd below
+        DWORD 00000000h, 00000001h, 00000000h, 00000000h
+ptr_L_mlkem_decompress_11_avx2_sllv QWORD L_mlkem_decompress_11_avx2_sllv
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_11_avx2_srlv QWORD 0000000000000000h, 0000000000000002h  ; per-qword right-shift counts (ymm7)
+        QWORD 0000000000000000h, 0000000000000002h
+ptr_L_mlkem_decompress_11_avx2_srlv QWORD L_mlkem_decompress_11_avx2_srlv
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_11_avx2_shift WORD 0020h, 0004h, 0001h, 0020h, 0008h, 0001h, 0020h, 0004h  ; per-word vpmullw multipliers (ymm8)
+        WORD 0020h, 0004h, 0001h, 0020h, 0008h, 0001h, 0020h, 0004h
+ptr_L_mlkem_decompress_11_avx2_shift QWORD L_mlkem_decompress_11_avx2_shift
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_11_avx2_mask WORD 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h  ; keeps bits 4-14 of every word (ymm9)
+        WORD 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h, 7ff0h
+ptr_L_mlkem_decompress_11_avx2_mask QWORD L_mlkem_decompress_11_avx2_mask
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_decompress_11_avx2 PROC  ; rcx = output words, rdx = packed input, r8d = pass count; each pass reads 352 bytes (11 bits per value) and writes 256 16-bit words (512 bytes)
+        sub rsp, 64  ; spill area for xmm6-xmm9, which are callee-saved in the Microsoft x64 calling convention
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu ymm4, YMMWORD PTR L_mlkem_decompress_11_avx2_q  ; ymm4 = 0d01h (3329) in every word
+        vmovdqu ymm5, YMMWORD PTR L_mlkem_decompress_11_avx2_shuf
+        vmovdqu ymm6, YMMWORD PTR L_mlkem_decompress_11_avx2_sllv
+        vmovdqu ymm7, YMMWORD PTR L_mlkem_decompress_11_avx2_srlv
+        vmovdqu ymm8, YMMWORD PTR L_mlkem_decompress_11_avx2_shift
+        vmovdqu ymm9, YMMWORD PTR L_mlkem_decompress_11_avx2_mask
+L_mlkem_decompress_11_avx2_start:  ; loop top; the body always runs at least once because the count is tested at the bottom
+        vpermq ymm0, [rdx], 148  ; group 1 of 4: 32-byte load at a 22-byte stride; imm8 148 = 10010100b selects source qwords 0,1,1,2
+        vpermq ymm1, [rdx+22], 148
+        vpermq ymm2, [rdx+44], 148
+        vpermq ymm3, [rdx+66], 148
+        vpshufb ymm0, ymm0, ymm5  ; gather the bytes holding each value into its own word
+        vpshufb ymm1, ymm1, ymm5
+        vpshufb ymm2, ymm2, ymm5
+        vpshufb ymm3, ymm3, ymm5
+        vpsrlvd ymm0, ymm0, ymm6  ; per-dword then per-qword variable right shifts
+        vpsrlvd ymm1, ymm1, ymm6
+        vpsrlvd ymm2, ymm2, ymm6
+        vpsrlvd ymm3, ymm3, ymm6
+        vpsrlvq ymm0, ymm0, ymm7
+        vpsrlvq ymm1, ymm1, ymm7
+        vpsrlvq ymm2, ymm2, ymm7
+        vpsrlvq ymm3, ymm3, ymm7
+        vpmullw ymm0, ymm0, ymm8  ; per-word multiply by a power of two = per-word left shift
+        vpmullw ymm1, ymm1, ymm8
+        vpmullw ymm2, ymm2, ymm8
+        vpmullw ymm3, ymm3, ymm8
+        vpsrlw ymm0, ymm0, 1
+        vpsrlw ymm1, ymm1, 1
+        vpsrlw ymm2, ymm2, 1
+        vpsrlw ymm3, ymm3, 1
+        vpand ymm0, ymm0, ymm9  ; keep the 11 bits at positions 4-14
+        vpand ymm1, ymm1, ymm9
+        vpand ymm2, ymm2, ymm9
+        vpand ymm3, ymm3, ymm9
+        vpmulhrsw ymm0, ymm0, ymm4  ; rounding high multiply by 3329: (a*3329 + 4000h) >> 15
+        vpmulhrsw ymm1, ymm1, ymm4
+        vpmulhrsw ymm2, ymm2, ymm4
+        vpmulhrsw ymm3, ymm3, ymm4
+        vmovdqu YMMWORD PTR [rcx], ymm0
+        vmovdqu YMMWORD PTR [rcx+32], ymm1
+        vmovdqu YMMWORD PTR [rcx+64], ymm2
+        vmovdqu YMMWORD PTR [rcx+96], ymm3
+        vpermq ymm0, [rdx+88], 148  ; group 2 of 4: same sequence for input offsets 88-175
+        vpermq ymm1, [rdx+110], 148
+        vpermq ymm2, [rdx+132], 148
+        vpermq ymm3, [rdx+154], 148
+        vpshufb ymm0, ymm0, ymm5
+        vpshufb ymm1, ymm1, ymm5
+        vpshufb ymm2, ymm2, ymm5
+        vpshufb ymm3, ymm3, ymm5
+        vpsrlvd ymm0, ymm0, ymm6
+        vpsrlvd ymm1, ymm1, ymm6
+        vpsrlvd ymm2, ymm2, ymm6
+        vpsrlvd ymm3, ymm3, ymm6
+        vpsrlvq ymm0, ymm0, ymm7
+        vpsrlvq ymm1, ymm1, ymm7
+        vpsrlvq ymm2, ymm2, ymm7
+        vpsrlvq ymm3, ymm3, ymm7
+        vpmullw ymm0, ymm0, ymm8
+        vpmullw ymm1, ymm1, ymm8
+        vpmullw ymm2, ymm2, ymm8
+        vpmullw ymm3, ymm3, ymm8
+        vpsrlw ymm0, ymm0, 1
+        vpsrlw ymm1, ymm1, 1
+        vpsrlw ymm2, ymm2, 1
+        vpsrlw ymm3, ymm3, 1
+        vpand ymm0, ymm0, ymm9
+        vpand ymm1, ymm1, ymm9
+        vpand ymm2, ymm2, ymm9
+        vpand ymm3, ymm3, ymm9
+        vpmulhrsw ymm0, ymm0, ymm4
+        vpmulhrsw ymm1, ymm1, ymm4
+        vpmulhrsw ymm2, ymm2, ymm4
+        vpmulhrsw ymm3, ymm3, ymm4
+        vmovdqu YMMWORD PTR [rcx+128], ymm0
+        vmovdqu YMMWORD PTR [rcx+160], ymm1
+        vmovdqu YMMWORD PTR [rcx+192], ymm2
+        vmovdqu YMMWORD PTR [rcx+224], ymm3
+        vpermq ymm0, [rdx+176], 148  ; group 3 of 4: input offsets 176-263
+        vpermq ymm1, [rdx+198], 148
+        vpermq ymm2, [rdx+220], 148
+        vpermq ymm3, [rdx+242], 148
+        vpshufb ymm0, ymm0, ymm5
+        vpshufb ymm1, ymm1, ymm5
+        vpshufb ymm2, ymm2, ymm5
+        vpshufb ymm3, ymm3, ymm5
+        vpsrlvd ymm0, ymm0, ymm6
+        vpsrlvd ymm1, ymm1, ymm6
+        vpsrlvd ymm2, ymm2, ymm6
+        vpsrlvd ymm3, ymm3, ymm6
+        vpsrlvq ymm0, ymm0, ymm7
+        vpsrlvq ymm1, ymm1, ymm7
+        vpsrlvq ymm2, ymm2, ymm7
+        vpsrlvq ymm3, ymm3, ymm7
+        vpmullw ymm0, ymm0, ymm8
+        vpmullw ymm1, ymm1, ymm8
+        vpmullw ymm2, ymm2, ymm8
+        vpmullw ymm3, ymm3, ymm8
+        vpsrlw ymm0, ymm0, 1
+        vpsrlw ymm1, ymm1, 1
+        vpsrlw ymm2, ymm2, 1
+        vpsrlw ymm3, ymm3, 1
+        vpand ymm0, ymm0, ymm9
+        vpand ymm1, ymm1, ymm9
+        vpand ymm2, ymm2, ymm9
+        vpand ymm3, ymm3, ymm9
+        vpmulhrsw ymm0, ymm0, ymm4
+        vpmulhrsw ymm1, ymm1, ymm4
+        vpmulhrsw ymm2, ymm2, ymm4
+        vpmulhrsw ymm3, ymm3, ymm4
+        vmovdqu YMMWORD PTR [rcx+256], ymm0
+        vmovdqu YMMWORD PTR [rcx+288], ymm1
+        vmovdqu YMMWORD PTR [rcx+320], ymm2
+        vmovdqu YMMWORD PTR [rcx+352], ymm3
+        vpermq ymm0, [rdx+264], 148  ; group 4 of 4: input offsets 264-351
+        vpermq ymm1, [rdx+286], 148
+        vpermq ymm2, [rdx+308], 148
+        vpermq ymm3, [rdx+330], 148  ; NOTE(review): this 32-byte load spans [rdx+330, rdx+362) but a pass consumes only 352 bytes, so the final pass reads 10 bytes beyond the consumed input - confirm the caller's buffer has that slack
+        vpshufb ymm0, ymm0, ymm5
+        vpshufb ymm1, ymm1, ymm5
+        vpshufb ymm2, ymm2, ymm5
+        vpshufb ymm3, ymm3, ymm5
+        vpsrlvd ymm0, ymm0, ymm6
+        vpsrlvd ymm1, ymm1, ymm6
+        vpsrlvd ymm2, ymm2, ymm6
+        vpsrlvd ymm3, ymm3, ymm6
+        vpsrlvq ymm0, ymm0, ymm7
+        vpsrlvq ymm1, ymm1, ymm7
+        vpsrlvq ymm2, ymm2, ymm7
+        vpsrlvq ymm3, ymm3, ymm7
+        vpmullw ymm0, ymm0, ymm8
+        vpmullw ymm1, ymm1, ymm8
+        vpmullw ymm2, ymm2, ymm8
+        vpmullw ymm3, ymm3, ymm8
+        vpsrlw ymm0, ymm0, 1
+        vpsrlw ymm1, ymm1, 1
+        vpsrlw ymm2, ymm2, 1
+        vpsrlw ymm3, ymm3, 1
+        vpand ymm0, ymm0, ymm9
+        vpand ymm1, ymm1, ymm9
+        vpand ymm2, ymm2, ymm9
+        vpand ymm3, ymm3, ymm9
+        vpmulhrsw ymm0, ymm0, ymm4
+        vpmulhrsw ymm1, ymm1, ymm4
+        vpmulhrsw ymm2, ymm2, ymm4
+        vpmulhrsw ymm3, ymm3, ymm4
+        vmovdqu YMMWORD PTR [rcx+384], ymm0
+        vmovdqu YMMWORD PTR [rcx+416], ymm1
+        vmovdqu YMMWORD PTR [rcx+448], ymm2
+        vmovdqu YMMWORD PTR [rcx+480], ymm3
+        add rdx, 352  ; advance input by one pass (256 values x 11 bits)
+        add rcx, 512  ; advance output by 256 words
+        sub r8d, 1
+        jg L_mlkem_decompress_11_avx2_start  ; repeat while the remaining count is still positive
+        vzeroupper  ; clear upper YMM halves before returning to the caller
+        vmovdqu xmm6, OWORD PTR [rsp]  ; restore the callee-saved registers spilled on entry
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        add rsp, 64
+        ret
+mlkem_decompress_11_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_4_avx2_mask WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh  ; keeps the low 4 bits of every word
+        WORD 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh, 000fh
+ptr_L_mlkem_compress_4_avx2_mask QWORD L_mlkem_compress_4_avx2_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_4_avx2_shift WORD 0200h, 0200h, 0200h, 0200h, 0200h, 0200h, 0200h, 0200h  ; vpmulhrsw by 0200h = rounding shift right by 6
+        WORD 0200h, 0200h, 0200h, 0200h, 0200h, 0200h, 0200h, 0200h
+ptr_L_mlkem_compress_4_avx2_shift QWORD L_mlkem_compress_4_avx2_shift
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_4_avx2_perm DWORD 00000000h, 00000004h, 00000001h, 00000005h  ; vpermd indices that interleave the two 128-bit lanes
+        DWORD 00000002h, 00000006h, 00000003h, 00000007h
+ptr_L_mlkem_compress_4_avx2_perm QWORD L_mlkem_compress_4_avx2_perm
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_4_avx2_v WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh  ; 4ebfh = 20159, the vpmulhw multiplier
+        WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh
+ptr_L_mlkem_compress_4_avx2_v QWORD L_mlkem_compress_4_avx2_v
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_4_avx2_shift12 WORD 1001h, 1001h, 1001h, 1001h, 1001h, 1001h, 1001h, 1001h  ; byte pair (1, 16): vpmaddubsw computes b0 + 16*b1
+        WORD 1001h, 1001h, 1001h, 1001h, 1001h, 1001h, 1001h, 1001h
+ptr_L_mlkem_compress_4_avx2_shift12 QWORD L_mlkem_compress_4_avx2_shift12
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_compress_4_avx2 PROC  ; rcx = packed output (128 bytes), rdx = 256 input words (512 bytes); no loop counter is used
+        sub rsp, 112  ; spill area for xmm6-xmm12 (callee-saved in the Microsoft x64 calling convention)
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu OWORD PTR [rsp+96], xmm12
+        vmovdqu ymm8, YMMWORD PTR L_mlkem_compress_4_avx2_mask
+        vmovdqu ymm9, YMMWORD PTR
L_mlkem_compress_4_avx2_shift
+        vmovdqu ymm10, YMMWORD PTR L_mlkem_compress_4_avx2_perm  ; ymm10 = vpermd indices
+        vmovdqu ymm11, YMMWORD PTR L_mlkem_compress_4_avx2_v  ; ymm11 = vpmulhw multiplier
+        vmovdqu ymm12, YMMWORD PTR L_mlkem_compress_4_avx2_shift12  ; ymm12 = vpmaddubsw byte weights
+        vpmulhw ymm0, ymm11, [rdx]  ; mlkem_compress_4_avx2 body: first quarter, input words at [rdx .. rdx+127]; high 16 bits of word * ymm11
+        vpmulhw ymm1, ymm11, [rdx+32]
+        vpmulhw ymm2, ymm11, [rdx+64]
+        vpmulhw ymm3, ymm11, [rdx+96]
+        vpmulhrsw ymm0, ymm0, ymm9  ; rounding high multiply by the constant in ymm9
+        vpmulhrsw ymm1, ymm1, ymm9
+        vpmulhrsw ymm2, ymm2, ymm9
+        vpmulhrsw ymm3, ymm3, ymm9
+        vpand ymm0, ymm0, ymm8  ; mask each word with ymm8
+        vpand ymm1, ymm1, ymm8
+        vpand ymm2, ymm2, ymm8
+        vpand ymm3, ymm3, ymm8
+        vpackuswb ymm0, ymm0, ymm1  ; narrow words to bytes (per 128-bit lane)
+        vpackuswb ymm2, ymm2, ymm3
+        vpmaddubsw ymm0, ymm0, ymm12  ; combine each adjacent byte pair into one word using the ymm12 weights
+        vpmaddubsw ymm2, ymm2, ymm12
+        vpackuswb ymm0, ymm0, ymm2  ; narrow again: 64 input words are now 32 bytes in ymm0
+        vpmulhw ymm4, ymm11, [rdx+128]  ; second quarter, same sequence in ymm4-ymm7
+        vpmulhw ymm5, ymm11, [rdx+160]
+        vpmulhw ymm6, ymm11, [rdx+192]
+        vpmulhw ymm7, ymm11, [rdx+224]
+        vpmulhrsw ymm4, ymm4, ymm9
+        vpmulhrsw ymm5, ymm5, ymm9
+        vpmulhrsw ymm6, ymm6, ymm9
+        vpmulhrsw ymm7, ymm7, ymm9
+        vpand ymm4, ymm4, ymm8
+        vpand ymm5, ymm5, ymm8
+        vpand ymm6, ymm6, ymm8
+        vpand ymm7, ymm7, ymm8
+        vpackuswb ymm4, ymm4, ymm5
+        vpackuswb ymm6, ymm6, ymm7
+        vpmaddubsw ymm4, ymm4, ymm12
+        vpmaddubsw ymm6, ymm6, ymm12
+        vpackuswb ymm4, ymm4, ymm6
+        vpermd ymm0, ymm10, ymm0  ; reorder dwords by the indices in ymm10 (the packs above work per 128-bit lane)
+        vpermd ymm4, ymm10, ymm4
+        vmovdqu YMMWORD PTR [rcx], ymm0
+        vmovdqu YMMWORD PTR [rcx+32], ymm4
+        vpmulhw ymm0, ymm11, [rdx+256]  ; third quarter
+        vpmulhw ymm1, ymm11, [rdx+288]
+        vpmulhw ymm2, ymm11, [rdx+320]
+        vpmulhw ymm3, ymm11, [rdx+352]
+        vpmulhrsw ymm0, ymm0, ymm9
+        vpmulhrsw ymm1, ymm1, ymm9
+        vpmulhrsw ymm2, ymm2, ymm9
+        vpmulhrsw ymm3, ymm3, ymm9
+        vpand ymm0, ymm0, ymm8
+        vpand ymm1, ymm1, ymm8
+        vpand ymm2, ymm2, ymm8
+        vpand ymm3, ymm3, ymm8
+        vpackuswb ymm0, ymm0, ymm1
+        vpackuswb ymm2, ymm2, ymm3
+        vpmaddubsw ymm0, ymm0, ymm12
+        vpmaddubsw ymm2, ymm2, ymm12
+        vpackuswb ymm0, ymm0, ymm2
+        vpmulhw ymm4, ymm11, [rdx+384]  ; fourth quarter
+        vpmulhw ymm5, ymm11, [rdx+416]
+        vpmulhw ymm6, ymm11, [rdx+448]
+        vpmulhw ymm7, ymm11, [rdx+480]
+        vpmulhrsw ymm4, ymm4, ymm9
+        vpmulhrsw ymm5, ymm5, ymm9
+        vpmulhrsw ymm6, ymm6, ymm9
+        vpmulhrsw ymm7, ymm7, ymm9
+        vpand ymm4, ymm4, ymm8
+        vpand ymm5, ymm5, ymm8
+        vpand ymm6, ymm6, ymm8
+        vpand ymm7, ymm7, ymm8
+        vpackuswb ymm4, ymm4, ymm5
+        vpackuswb ymm6, ymm6, ymm7
+        vpmaddubsw ymm4, ymm4, ymm12
+        vpmaddubsw ymm6, ymm6, ymm12
+        vpackuswb ymm4, ymm4, ymm6
+        vpermd ymm0, ymm10, ymm0
+        vpermd ymm4, ymm10, ymm4
+        vmovdqu YMMWORD PTR [rcx+64], ymm0
+        vmovdqu YMMWORD PTR [rcx+96], ymm4
+        vzeroupper  ; clear upper YMM halves before returning to the caller
+        vmovdqu xmm6, OWORD PTR [rsp]  ; restore the registers spilled in the prologue
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        vmovdqu xmm9, OWORD PTR [rsp+48]
+        vmovdqu xmm10, OWORD PTR [rsp+64]
+        vmovdqu xmm11, OWORD PTR [rsp+80]
+        vmovdqu xmm12, OWORD PTR [rsp+96]
+        add rsp, 112
+        ret
+mlkem_compress_4_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_4_avx2_mask DWORD 00f0000fh, 00f0000fh, 00f0000fh, 00f0000fh  ; as words: 000fh, 00f0h alternating (low nibble / high nibble)
+        DWORD 00f0000fh, 00f0000fh, 00f0000fh, 00f0000fh
+ptr_L_mlkem_decompress_4_avx2_mask QWORD L_mlkem_decompress_4_avx2_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_4_avx2_shift DWORD 00800800h, 00800800h, 00800800h, 00800800h  ; as words: 0800h, 0080h alternating, moving either nibble up to bits 11-14
+        DWORD 00800800h, 00800800h, 00800800h, 00800800h
+ptr_L_mlkem_decompress_4_avx2_shift QWORD L_mlkem_decompress_4_avx2_shift
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_4_avx2_q WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h  ; 0d01h = 3329
+        WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h
+ptr_L_mlkem_decompress_4_avx2_q QWORD L_mlkem_decompress_4_avx2_q
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_4_avx2_shuf BYTE 00h, 00h, 00h, 00h, 01h, 01h, 01h, 01h  ; vpshufb selectors: each source byte is copied four times (two words)
+        BYTE 02h, 02h, 02h, 02h, 03h, 03h, 03h, 03h
+        BYTE 04h, 04h, 04h, 04h, 05h, 05h, 05h, 05h
+        BYTE 06h, 06h, 06h, 06h, 07h, 07h, 07h, 07h
+ptr_L_mlkem_decompress_4_avx2_shuf QWORD L_mlkem_decompress_4_avx2_shuf
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_decompress_4_avx2 PROC  ; rcx = 256 output words (512 bytes), rdx = packed input (128 bytes, two 4-bit values per byte)
+        sub rsp, 32  ; spill area for xmm6 and xmm7 (callee-saved in the Microsoft x64 calling convention)
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu ymm4, YMMWORD PTR
L_mlkem_decompress_4_avx2_mask
+        vmovdqu ymm5, YMMWORD PTR L_mlkem_decompress_4_avx2_shift  ; ymm5 = per-word vpmullw multipliers
+        vmovdqu ymm6, YMMWORD PTR L_mlkem_decompress_4_avx2_shuf  ; ymm6 = vpshufb selectors
+        vmovdqu ymm7, YMMWORD PTR L_mlkem_decompress_4_avx2_q  ; ymm7 = vpmulhrsw multiplier
+        vpbroadcastq ymm0, QWORD PTR [rdx]  ; mlkem_decompress_4_avx2 body, group 1 of 4: each 8 input bytes become 16 output words
+        vpbroadcastq ymm1, QWORD PTR [rdx+8]
+        vpbroadcastq ymm2, QWORD PTR [rdx+16]
+        vpbroadcastq ymm3, QWORD PTR [rdx+24]
+        vpshufb ymm0, ymm0, ymm6  ; spread the input bytes across the register per the ymm6 selectors
+        vpshufb ymm1, ymm1, ymm6
+        vpshufb ymm2, ymm2, ymm6
+        vpshufb ymm3, ymm3, ymm6
+        vpand ymm0, ymm0, ymm4  ; mask each word with ymm4
+        vpand ymm1, ymm1, ymm4
+        vpand ymm2, ymm2, ymm4
+        vpand ymm3, ymm3, ymm4
+        vpmullw ymm0, ymm0, ymm5  ; per-word multiply by the ymm5 constants
+        vpmullw ymm1, ymm1, ymm5
+        vpmullw ymm2, ymm2, ymm5
+        vpmullw ymm3, ymm3, ymm5
+        vpmulhrsw ymm0, ymm0, ymm7  ; rounding high multiply by ymm7: (a*b + 4000h) >> 15
+        vpmulhrsw ymm1, ymm1, ymm7
+        vpmulhrsw ymm2, ymm2, ymm7
+        vpmulhrsw ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rcx], ymm0
+        vmovdqu YMMWORD PTR [rcx+32], ymm1
+        vmovdqu YMMWORD PTR [rcx+64], ymm2
+        vmovdqu YMMWORD PTR [rcx+96], ymm3
+        vpbroadcastq ymm0, QWORD PTR [rdx+32]  ; group 2 of 4
+        vpbroadcastq ymm1, QWORD PTR [rdx+40]
+        vpbroadcastq ymm2, QWORD PTR [rdx+48]
+        vpbroadcastq ymm3, QWORD PTR [rdx+56]
+        vpshufb ymm0, ymm0, ymm6
+        vpshufb ymm1, ymm1, ymm6
+        vpshufb ymm2, ymm2, ymm6
+        vpshufb ymm3, ymm3, ymm6
+        vpand ymm0, ymm0, ymm4
+        vpand ymm1, ymm1, ymm4
+        vpand ymm2, ymm2, ymm4
+        vpand ymm3, ymm3, ymm4
+        vpmullw ymm0, ymm0, ymm5
+        vpmullw ymm1, ymm1, ymm5
+        vpmullw ymm2, ymm2, ymm5
+        vpmullw ymm3, ymm3, ymm5
+        vpmulhrsw ymm0, ymm0, ymm7
+        vpmulhrsw ymm1, ymm1, ymm7
+        vpmulhrsw ymm2, ymm2, ymm7
+        vpmulhrsw ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rcx+128], ymm0
+        vmovdqu YMMWORD PTR [rcx+160], ymm1
+        vmovdqu YMMWORD PTR [rcx+192], ymm2
+        vmovdqu YMMWORD PTR [rcx+224], ymm3
+        vpbroadcastq ymm0, QWORD PTR [rdx+64]  ; group 3 of 4
+        vpbroadcastq ymm1, QWORD PTR [rdx+72]
+        vpbroadcastq ymm2, QWORD PTR [rdx+80]
+        vpbroadcastq ymm3, QWORD PTR [rdx+88]
+        vpshufb ymm0, ymm0, ymm6
+        vpshufb ymm1, ymm1, ymm6
+        vpshufb ymm2, ymm2, ymm6
+        vpshufb ymm3, ymm3, ymm6
+        vpand ymm0, ymm0, ymm4
+        vpand ymm1, ymm1, ymm4
+        vpand ymm2, ymm2, ymm4
+        vpand ymm3, ymm3, ymm4
+        vpmullw ymm0, ymm0, ymm5
+        vpmullw ymm1, ymm1, ymm5
+        vpmullw ymm2, ymm2, ymm5
+        vpmullw ymm3, ymm3, ymm5
+        vpmulhrsw ymm0, ymm0, ymm7
+        vpmulhrsw ymm1, ymm1, ymm7
+        vpmulhrsw ymm2, ymm2, ymm7
+        vpmulhrsw ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rcx+256], ymm0
+        vmovdqu YMMWORD PTR [rcx+288], ymm1
+        vmovdqu YMMWORD PTR [rcx+320], ymm2
+        vmovdqu YMMWORD PTR [rcx+352], ymm3
+        vpbroadcastq ymm0, QWORD PTR [rdx+96]  ; group 4 of 4
+        vpbroadcastq ymm1, QWORD PTR [rdx+104]
+        vpbroadcastq ymm2, QWORD PTR [rdx+112]
+        vpbroadcastq ymm3, QWORD PTR [rdx+120]
+        vpshufb ymm0, ymm0, ymm6
+        vpshufb ymm1, ymm1, ymm6
+        vpshufb ymm2, ymm2, ymm6
+        vpshufb ymm3, ymm3, ymm6
+        vpand ymm0, ymm0, ymm4
+        vpand ymm1, ymm1, ymm4
+        vpand ymm2, ymm2, ymm4
+        vpand ymm3, ymm3, ymm4
+        vpmullw ymm0, ymm0, ymm5
+        vpmullw ymm1, ymm1, ymm5
+        vpmullw ymm2, ymm2, ymm5
+        vpmullw ymm3, ymm3, ymm5
+        vpmulhrsw ymm0, ymm0, ymm7
+        vpmulhrsw ymm1, ymm1, ymm7
+        vpmulhrsw ymm2, ymm2, ymm7
+        vpmulhrsw ymm3, ymm3, ymm7
+        vmovdqu YMMWORD PTR [rcx+384], ymm0
+        vmovdqu YMMWORD PTR [rcx+416], ymm1
+        vmovdqu YMMWORD PTR [rcx+448], ymm2
+        vmovdqu YMMWORD PTR [rcx+480], ymm3
+        vzeroupper  ; clear upper YMM halves before returning to the caller
+        vmovdqu xmm6, OWORD PTR [rsp]  ; restore the registers spilled in the prologue
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        add rsp, 32
+        ret
+mlkem_decompress_4_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_5_avx2_v WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh  ; 4ebfh = 20159
+        WORD 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh, 4ebfh
+ptr_L_mlkem_compress_5_avx2_v QWORD L_mlkem_compress_5_avx2_v
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_5_avx2_shift WORD 0400h, 0400h, 0400h, 0400h, 0400h, 0400h, 0400h, 0400h  ; 0400h = 1024
+        WORD 0400h, 0400h, 0400h, 0400h, 0400h, 0400h, 0400h, 0400h
+ptr_L_mlkem_compress_5_avx2_shift QWORD L_mlkem_compress_5_avx2_shift
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_5_avx2_mask WORD 001fh, 001fh, 001fh, 001fh, 001fh, 001fh, 001fh, 001fh  ; low 5 bits of every word
+        WORD 001fh, 001fh, 001fh, 001fh, 001fh, 001fh,
001fh, 001fh
+ptr_L_mlkem_compress_5_avx2_mask QWORD L_mlkem_compress_5_avx2_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_5_avx2_shift1 WORD 2001h, 2001h, 2001h, 2001h, 2001h, 2001h, 2001h, 2001h  ; byte pair (1, 32): vpmaddubsw computes b0 + 32*b1
+        WORD 2001h, 2001h, 2001h, 2001h, 2001h, 2001h, 2001h, 2001h
+ptr_L_mlkem_compress_5_avx2_shift1 QWORD L_mlkem_compress_5_avx2_shift1
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_5_avx2_shift2 DWORD 04000001h, 04000001h, 04000001h, 04000001h  ; word pair (1, 1024): vpmaddwd computes w0 + 1024*w1
+        DWORD 04000001h, 04000001h, 04000001h, 04000001h
+ptr_L_mlkem_compress_5_avx2_shift2 QWORD L_mlkem_compress_5_avx2_shift2
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_5_avx2_shlv QWORD 000000000000000ch, 000000000000000ch  ; count 12; read as dwords by vpsllvd (12, 0 pattern) and as qwords by vpsrlvq
+        QWORD 000000000000000ch, 000000000000000ch
+ptr_L_mlkem_compress_5_avx2_shlv QWORD L_mlkem_compress_5_avx2_shlv
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_compress_5_avx2_shuffle BYTE 00h, 01h, 02h, 03h, 04h, 0ffh, 0ffh, 0ffh  ; vpshufb selectors; the first 16 bytes also serve as the vpblendvb mask (0ffh = take the byte from the upper lane)
+        BYTE 0ffh, 0ffh, 08h, 09h, 0ah, 0bh, 0ch, 0ffh
+        BYTE 09h, 0ah, 0bh, 0ch, 0ffh, 00h, 01h, 02h
+        BYTE 03h, 04h, 0ffh, 0ffh, 0ffh, 0ffh, 0ffh, 08h
+ptr_L_mlkem_compress_5_avx2_shuffle QWORD L_mlkem_compress_5_avx2_shuffle
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_compress_5_avx2 PROC  ; rcx = packed output (160 bytes), rdx = 256 input words (512 bytes); 8 groups of 32 words -> 20 bytes each, no loop counter
+        sub rsp, 48  ; spill area for xmm6-xmm8 (callee-saved in the Microsoft x64 calling convention)
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu ymm0, YMMWORD PTR [rdx]  ; NOTE(review): dead load - ymm0 is overwritten by the first vpmulhw below before it is read; candidate for removal in the generator
+        vmovdqu ymm2, YMMWORD PTR L_mlkem_compress_5_avx2_v
+        vmovdqu ymm3, YMMWORD PTR L_mlkem_compress_5_avx2_shift
+        vmovdqu ymm4, YMMWORD PTR L_mlkem_compress_5_avx2_mask
+        vmovdqu ymm5, YMMWORD PTR L_mlkem_compress_5_avx2_shift1
+        vmovdqu ymm6, YMMWORD PTR L_mlkem_compress_5_avx2_shift2
+        vmovdqu ymm7, YMMWORD PTR L_mlkem_compress_5_avx2_shlv
+        vmovdqu ymm8, YMMWORD PTR L_mlkem_compress_5_avx2_shuffle
+        vpmulhw ymm0, ymm2, [rdx]  ; group 1 of 8: high 16 bits of word * ymm2
+        vpmulhw ymm1, ymm2, [rdx+32]
+        vpmulhrsw ymm0, ymm0, ymm3  ; rounding high multiply by ymm3
+        vpmulhrsw ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm4  ; mask each word with ymm4
+        vpand ymm1, ymm1, ymm4
+        vpackuswb ymm0, ymm0, ymm1  ; narrow the 32 words to bytes (per 128-bit lane)
+        vpmaddubsw ymm0, ymm0, ymm5  ; merge byte pairs into words
+        vpmaddwd ymm0, ymm0, ymm6  ; merge word pairs into dwords
+        vpsllvd ymm0, ymm0, ymm7  ; shift alternate dwords left, then each qword right, by the ymm7 counts
+        vpsrlvq ymm0, ymm0, ymm7
+        vpshufb ymm0, ymm0, ymm8  ; move the packed bytes to their output positions within each lane
+        vextracti128 xmm1, ymm0, 1
+        vpblendvb xmm0, xmm0, xmm1, xmm8  ; fill the 0ffh-selected byte positions from the upper lane
+        vmovdqu OWORD PTR [rcx], xmm0  ; bytes 0-15 of the 20-byte group
+        vmovss DWORD PTR [rcx+16], xmm1  ; bytes 16-19; VEX-encoded (was legacy movss) so no SSE instruction runs while the upper YMM halves are dirty
+        vpmulhw ymm0, ymm2, [rdx+64]  ; group 2 of 8
+        vpmulhw ymm1, ymm2, [rdx+96]
+        vpmulhrsw ymm0, ymm0, ymm3
+        vpmulhrsw ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm4
+        vpand ymm1, ymm1, ymm4
+        vpackuswb ymm0, ymm0, ymm1
+        vpmaddubsw ymm0, ymm0, ymm5
+        vpmaddwd ymm0, ymm0, ymm6
+        vpsllvd ymm0, ymm0, ymm7
+        vpsrlvq ymm0, ymm0, ymm7
+        vpshufb ymm0, ymm0, ymm8
+        vextracti128 xmm1, ymm0, 1
+        vpblendvb xmm0, xmm0, xmm1, xmm8
+        vmovdqu OWORD PTR [rcx+20], xmm0
+        vmovss DWORD PTR [rcx+36], xmm1
+        vpmulhw ymm0, ymm2, [rdx+128]  ; group 3 of 8
+        vpmulhw ymm1, ymm2, [rdx+160]
+        vpmulhrsw ymm0, ymm0, ymm3
+        vpmulhrsw ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm4
+        vpand ymm1, ymm1, ymm4
+        vpackuswb ymm0, ymm0, ymm1
+        vpmaddubsw ymm0, ymm0, ymm5
+        vpmaddwd ymm0, ymm0, ymm6
+        vpsllvd ymm0, ymm0, ymm7
+        vpsrlvq ymm0, ymm0, ymm7
+        vpshufb ymm0, ymm0, ymm8
+        vextracti128 xmm1, ymm0, 1
+        vpblendvb xmm0, xmm0, xmm1, xmm8
+        vmovdqu OWORD PTR [rcx+40], xmm0
+        vmovss DWORD PTR [rcx+56], xmm1
+        vpmulhw ymm0, ymm2, [rdx+192]  ; group 4 of 8
+        vpmulhw ymm1, ymm2, [rdx+224]
+        vpmulhrsw ymm0, ymm0, ymm3
+        vpmulhrsw ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm4
+        vpand ymm1, ymm1, ymm4
+        vpackuswb ymm0, ymm0, ymm1
+        vpmaddubsw ymm0, ymm0, ymm5
+        vpmaddwd ymm0, ymm0, ymm6
+        vpsllvd ymm0, ymm0, ymm7
+        vpsrlvq ymm0, ymm0, ymm7
+        vpshufb ymm0, ymm0, ymm8
+        vextracti128 xmm1, ymm0, 1
+        vpblendvb xmm0, xmm0, xmm1, xmm8
+        vmovdqu OWORD PTR [rcx+60], xmm0
+        vmovss DWORD PTR [rcx+76], xmm1
+        vpmulhw ymm0, ymm2, [rdx+256]  ; group 5 of 8
+        vpmulhw ymm1, ymm2, [rdx+288]
+        vpmulhrsw ymm0, ymm0, ymm3
+        vpmulhrsw ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm4
+        vpand ymm1, ymm1, ymm4
+        vpackuswb ymm0, ymm0, ymm1
+        vpmaddubsw ymm0, ymm0, ymm5
+        vpmaddwd ymm0, ymm0, ymm6
+        vpsllvd ymm0, ymm0, ymm7
+        vpsrlvq ymm0, ymm0, ymm7
+        vpshufb ymm0, ymm0, ymm8
+        vextracti128 xmm1, ymm0, 1
+        vpblendvb xmm0, xmm0, xmm1, xmm8
+        vmovdqu OWORD PTR [rcx+80], xmm0
+        vmovss DWORD PTR [rcx+96], xmm1
+        vpmulhw ymm0, ymm2, [rdx+320]  ; group 6 of 8
+        vpmulhw ymm1, ymm2, [rdx+352]
+        vpmulhrsw ymm0, ymm0, ymm3
+        vpmulhrsw ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm4
+        vpand ymm1, ymm1, ymm4
+        vpackuswb ymm0, ymm0, ymm1
+        vpmaddubsw ymm0, ymm0, ymm5
+        vpmaddwd ymm0, ymm0, ymm6
+        vpsllvd ymm0, ymm0, ymm7
+        vpsrlvq ymm0, ymm0, ymm7
+        vpshufb ymm0, ymm0, ymm8
+        vextracti128 xmm1, ymm0, 1
+        vpblendvb xmm0, xmm0, xmm1, xmm8
+        vmovdqu OWORD PTR [rcx+100], xmm0
+        vmovss DWORD PTR [rcx+116], xmm1
+        vpmulhw ymm0, ymm2, [rdx+384]  ; group 7 of 8
+        vpmulhw ymm1, ymm2, [rdx+416]
+        vpmulhrsw ymm0, ymm0, ymm3
+        vpmulhrsw ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm4
+        vpand ymm1, ymm1, ymm4
+        vpackuswb ymm0, ymm0, ymm1
+        vpmaddubsw ymm0, ymm0, ymm5
+        vpmaddwd ymm0, ymm0, ymm6
+        vpsllvd ymm0, ymm0, ymm7
+        vpsrlvq ymm0, ymm0, ymm7
+        vpshufb ymm0, ymm0, ymm8
+        vextracti128 xmm1, ymm0, 1
+        vpblendvb xmm0, xmm0, xmm1, xmm8
+        vmovdqu OWORD PTR [rcx+120], xmm0
+        vmovss DWORD PTR [rcx+136], xmm1
+        vpmulhw ymm0, ymm2, [rdx+448]  ; group 8 of 8
+        vpmulhw ymm1, ymm2, [rdx+480]
+        vpmulhrsw ymm0, ymm0, ymm3
+        vpmulhrsw ymm1, ymm1, ymm3
+        vpand ymm0, ymm0, ymm4
+        vpand ymm1, ymm1, ymm4
+        vpackuswb ymm0, ymm0, ymm1
+        vpmaddubsw ymm0, ymm0, ymm5
+        vpmaddwd ymm0, ymm0, ymm6
+        vpsllvd ymm0, ymm0, ymm7
+        vpsrlvq ymm0, ymm0, ymm7
+        vpshufb ymm0, ymm0, ymm8
+        vextracti128 xmm1, ymm0, 1
+        vpblendvb xmm0, xmm0, xmm1, xmm8
+        vmovdqu OWORD PTR [rcx+140], xmm0
+        vmovss DWORD PTR [rcx+156], xmm1  ; last store ends exactly at rcx+160
+        vzeroupper  ; clear upper YMM halves before returning to the caller
+        vmovdqu xmm6, OWORD PTR [rsp]  ; restore the registers spilled in the prologue
+        vmovdqu xmm7, OWORD PTR [rsp+16]
+        vmovdqu xmm8, OWORD PTR [rsp+32]
+        add rsp, 48
+        ret
+mlkem_compress_5_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_5_avx2_q WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h  ; 0d01h = 3329
+        WORD 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h, 0d01h
+ptr_L_mlkem_decompress_5_avx2_q QWORD L_mlkem_decompress_5_avx2_q
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_5_avx2_shuf BYTE 00h, 00h, 00h, 01h, 01h, 01h, 01h, 02h  ; vpshufb selectors: only source bytes 0-9 of each 16-byte load are referenced
+        BYTE 02h, 03h, 03h, 03h, 03h, 04h, 04h, 04h
+        BYTE 05h, 05h, 05h, 06h, 06h, 06h, 06h, 07h
+        BYTE 07h, 08h, 08h, 08h, 08h, 09h, 09h, 09h
+ptr_L_mlkem_decompress_5_avx2_shuf QWORD L_mlkem_decompress_5_avx2_shuf
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_5_avx2_mask WORD 001fh, 03e0h, 007ch, 0f80h, 01f0h, 003eh, 07c0h, 00f8h  ; a 5-bit field at a different bit offset in each word
+        WORD 001fh, 03e0h, 007ch, 0f80h, 01f0h, 003eh, 07c0h, 00f8h
+ptr_L_mlkem_decompress_5_avx2_mask QWORD L_mlkem_decompress_5_avx2_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_decompress_5_avx2_shift WORD 0400h, 0020h, 0100h, 0008h, 0040h, 0200h, 0010h, 0080h  ; multipliers that move each masked field up to bits 10-14
+        WORD 0400h, 0020h, 0100h, 0008h, 0040h, 0200h, 0010h, 0080h
+ptr_L_mlkem_decompress_5_avx2_shift QWORD L_mlkem_decompress_5_avx2_shift
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_decompress_5_avx2 PROC  ; rcx = 256 output words (512 bytes), rdx = packed input (160 bytes, 5 bits per value); only ymm0-ymm4 are used, so nothing is spilled
+        vmovdqu ymm1, YMMWORD PTR L_mlkem_decompress_5_avx2_q
+        vmovdqu ymm2, YMMWORD PTR L_mlkem_decompress_5_avx2_shuf
+        vmovdqu ymm3, YMMWORD PTR L_mlkem_decompress_5_avx2_mask
+        vmovdqu ymm4, YMMWORD PTR L_mlkem_decompress_5_avx2_shift
+        vbroadcasti128 ymm0, OWORD PTR [rdx]  ; group 1 of 16: 16-byte load at a 10-byte stride, copied into both lanes
+        vpshufb ymm0, ymm0, ymm2  ; place the byte(s) holding each value into its word
+        vpand ymm0, ymm0, ymm3  ; isolate the 5-bit field in each word
+        vpmullw ymm0, ymm0, ymm4  ; align every field to bits 10-14
+        vpmulhrsw ymm0, ymm0, ymm1  ; rounding high multiply by 3329: (a*3329 + 4000h) >> 15
+        vmovdqu YMMWORD PTR [rcx], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+10]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+32], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+20]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+64], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+30]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+96], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+40]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+128], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+50]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+160], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+60]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+192], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+70]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+224], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+80]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+256], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+90]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+288], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+100]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+320], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+110]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+352], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+120]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+384], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+130]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+416], ymm0
+        vbroadcasti128 ymm0, OWORD PTR [rdx+140]
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+448], ymm0
+        vmovq xmm0, QWORD PTR [rdx+150]  ; group 16: load exactly the last 10 bytes (8 + 2); a 16-byte load here would extend to rdx+166
+        movzx rax, WORD PTR [rdx+158]  ; rax is a volatile register, so no save is needed
+        vpinsrq xmm0, xmm0, rax, 1
+        vinserti128 ymm0, ymm0, xmm0, 1  ; duplicate the low lane into the high lane, matching what vbroadcasti128 does for the other groups
+        vpshufb ymm0, ymm0, ymm2
+        vpand ymm0, ymm0, ymm3
+        vpmullw ymm0, ymm0, ymm4
+        vpmulhrsw ymm0, ymm0, ymm1
+        vmovdqu YMMWORD PTR [rcx+480], ymm0
+        vzeroupper  ; clear upper YMM halves before returning to the caller
+        ret
+mlkem_decompress_5_avx2 ENDP
+_TEXT ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_from_msg_avx2_shift DWORD 00000003h, 00000002h, 00000001h, 00000000h
+        DWORD 00000003h, 00000002h, 00000001h, 00000000h
+ptr_L_mlkem_from_msg_avx2_shift QWORD L_mlkem_from_msg_avx2_shift
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_from_msg_avx2_shuf BYTE 00h, 01h, 04h, 05h, 08h, 09h, 0ch, 0dh
+        BYTE 02h, 03h, 06h, 07h, 0ah, 0bh, 0eh, 0fh
+        BYTE 00h, 01h, 04h, 05h, 08h, 09h, 0ch, 0dh
+        BYTE 02h, 03h, 06h, 07h, 0ah, 0bh, 0eh, 0fh
+ptr_L_mlkem_from_msg_avx2_shuf QWORD L_mlkem_from_msg_avx2_shuf
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_mlkem_from_msg_avx2_hqs WORD 0681h, 0681h, 0681h, 0681h, 0681h, 0681h, 0681h, 0681h
+        WORD 0681h, 0681h, 0681h, 0681h, 0681h, 0681h, 0681h, 0681h
+ptr_L_mlkem_from_msg_avx2_hqs QWORD L_mlkem_from_msg_avx2_hqs
+_DATA ENDS
+_TEXT SEGMENT READONLY PARA
+mlkem_from_msg_avx2 PROC
+        sub rsp, 96
+        vmovdqu OWORD PTR [rsp], xmm6
+        vmovdqu OWORD PTR [rsp+16], xmm7
+        vmovdqu OWORD PTR [rsp+32], xmm8
+        vmovdqu OWORD PTR [rsp+48], xmm9
+        vmovdqu OWORD PTR [rsp+64], xmm10
+        vmovdqu OWORD PTR [rsp+80], xmm11
+        vmovdqu ymm0, YMMWORD PTR [rdx]
+        vmovdqu ymm9, YMMWORD PTR L_mlkem_from_msg_avx2_shift
+        vmovdqu ymm10, YMMWORD PTR L_mlkem_from_msg_avx2_shuf
+        vmovdqu ymm11, YMMWORD PTR L_mlkem_from_msg_avx2_hqs
+        vpshufd ymm4, ymm0, 0
+        vpsllvd ymm4, ymm4, ymm9
+        vpshufb ymm4, ymm4, ymm10
+        vpsllw ymm1, ymm4, 12
+        vpsllw ymm2, ymm4, 8
+        vpsllw ymm3, ymm4, 4
+        vpsraw ymm1, ymm1, 15
+        vpsraw ymm2, ymm2, 15
+        vpsraw ymm3, ymm3, 15
+        vpsraw ymm4, ymm4, 15
+        vpand ymm1, ymm1, ymm11
+        vpand ymm2, ymm2, ymm11
+        vpand ymm3, ymm3, ymm11
+        vpand ymm4, ymm4, ymm11
+        vpunpcklqdq ymm5, ymm1, ymm2
+        vpunpckhqdq ymm7, ymm1, ymm2
+        vpunpcklqdq ymm6, ymm3,
ymm4 + vpunpckhqdq ymm8, ymm3, ymm4 + vperm2i128 ymm1, ymm5, ymm6, 32 + vperm2i128 ymm3, ymm5, ymm6, 49 + vperm2i128 ymm2, ymm7, ymm8, 32 + vperm2i128 ymm4, ymm7, ymm8, 49 + vmovdqu YMMWORD PTR [rcx], ymm1 + vmovdqu YMMWORD PTR [rcx+32], ymm2 + vmovdqu YMMWORD PTR [rcx+256], ymm3 + vmovdqu YMMWORD PTR [rcx+288], ymm4 + vpshufd ymm4, ymm0, 85 + vpsllvd ymm4, ymm4, ymm9 + vpshufb ymm4, ymm4, ymm10 + vpsllw ymm1, ymm4, 12 + vpsllw ymm2, ymm4, 8 + vpsllw ymm3, ymm4, 4 + vpsraw ymm1, ymm1, 15 + vpsraw ymm2, ymm2, 15 + vpsraw ymm3, ymm3, 15 + vpsraw ymm4, ymm4, 15 + vpand ymm1, ymm1, ymm11 + vpand ymm2, ymm2, ymm11 + vpand ymm3, ymm3, ymm11 + vpand ymm4, ymm4, ymm11 + vpunpcklqdq ymm5, ymm1, ymm2 + vpunpckhqdq ymm7, ymm1, ymm2 + vpunpcklqdq ymm6, ymm3, ymm4 + vpunpckhqdq ymm8, ymm3, ymm4 + vperm2i128 ymm1, ymm5, ymm6, 32 + vperm2i128 ymm3, ymm5, ymm6, 49 + vperm2i128 ymm2, ymm7, ymm8, 32 + vperm2i128 ymm4, ymm7, ymm8, 49 + vmovdqu YMMWORD PTR [rcx+64], ymm1 + vmovdqu YMMWORD PTR [rcx+96], ymm2 + vmovdqu YMMWORD PTR [rcx+320], ymm3 + vmovdqu YMMWORD PTR [rcx+352], ymm4 + vpshufd ymm4, ymm0, 170 + vpsllvd ymm4, ymm4, ymm9 + vpshufb ymm4, ymm4, ymm10 + vpsllw ymm1, ymm4, 12 + vpsllw ymm2, ymm4, 8 + vpsllw ymm3, ymm4, 4 + vpsraw ymm1, ymm1, 15 + vpsraw ymm2, ymm2, 15 + vpsraw ymm3, ymm3, 15 + vpsraw ymm4, ymm4, 15 + vpand ymm1, ymm1, ymm11 + vpand ymm2, ymm2, ymm11 + vpand ymm3, ymm3, ymm11 + vpand ymm4, ymm4, ymm11 + vpunpcklqdq ymm5, ymm1, ymm2 + vpunpckhqdq ymm7, ymm1, ymm2 + vpunpcklqdq ymm6, ymm3, ymm4 + vpunpckhqdq ymm8, ymm3, ymm4 + vperm2i128 ymm1, ymm5, ymm6, 32 + vperm2i128 ymm3, ymm5, ymm6, 49 + vperm2i128 ymm2, ymm7, ymm8, 32 + vperm2i128 ymm4, ymm7, ymm8, 49 + vmovdqu YMMWORD PTR [rcx+128], ymm1 + vmovdqu YMMWORD PTR [rcx+160], ymm2 + vmovdqu YMMWORD PTR [rcx+384], ymm3 + vmovdqu YMMWORD PTR [rcx+416], ymm4 + vpshufd ymm4, ymm0, 255 + vpsllvd ymm4, ymm4, ymm9 + vpshufb ymm4, ymm4, ymm10 + vpsllw ymm1, ymm4, 12 + vpsllw ymm2, ymm4, 8 + vpsllw ymm3, ymm4, 4 + 
vpsraw ymm1, ymm1, 15 + vpsraw ymm2, ymm2, 15 + vpsraw ymm3, ymm3, 15 + vpsraw ymm4, ymm4, 15 + vpand ymm1, ymm1, ymm11 + vpand ymm2, ymm2, ymm11 + vpand ymm3, ymm3, ymm11 + vpand ymm4, ymm4, ymm11 + vpunpcklqdq ymm5, ymm1, ymm2 + vpunpckhqdq ymm7, ymm1, ymm2 + vpunpcklqdq ymm6, ymm3, ymm4 + vpunpckhqdq ymm8, ymm3, ymm4 + vperm2i128 ymm1, ymm5, ymm6, 32 + vperm2i128 ymm3, ymm5, ymm6, 49 + vperm2i128 ymm2, ymm7, ymm8, 32 + vperm2i128 ymm4, ymm7, ymm8, 49 + vmovdqu YMMWORD PTR [rcx+192], ymm1 + vmovdqu YMMWORD PTR [rcx+224], ymm2 + vmovdqu YMMWORD PTR [rcx+448], ymm3 + vmovdqu YMMWORD PTR [rcx+480], ymm4 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + add rsp, 96 + ret +mlkem_from_msg_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_to_msg_avx2_hqs WORD 0680h, 0680h, 0680h, 0680h, 0680h, 0680h, 0680h, 0680h + WORD 0680h, 0680h, 0680h, 0680h, 0680h, 0680h, 0680h, 0680h +ptr_L_mlkem_to_msg_avx2_hqs QWORD L_mlkem_to_msg_avx2_hqs +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_to_msg_avx2_hhqs WORD 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h + WORD 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h, 0fcc1h +ptr_L_mlkem_to_msg_avx2_hhqs QWORD L_mlkem_to_msg_avx2_hhqs +_DATA ENDS +_TEXT SEGMENT READONLY PARA +mlkem_to_msg_avx2 PROC + sub rsp, 64 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu ymm8, YMMWORD PTR L_mlkem_to_msg_avx2_hqs + vmovdqu ymm9, YMMWORD PTR L_mlkem_to_msg_avx2_hhqs + vpsubw ymm0, ymm8, [rdx] + vpsubw ymm1, ymm8, [rdx+32] + vpsubw ymm2, ymm8, [rdx+64] + vpsubw ymm3, ymm8, [rdx+96] + vpsraw ymm4, ymm0, 15 + vpsraw ymm5, ymm1, 15 + vpsraw ymm6, ymm2, 15 + vpsraw ymm7, ymm3, 15 + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm5 + vpxor ymm2, ymm2, ymm6 + vpxor 
ymm3, ymm3, ymm7 + vpaddw ymm0, ymm0, ymm9 + vpaddw ymm1, ymm1, ymm9 + vpaddw ymm2, ymm2, ymm9 + vpaddw ymm3, ymm3, ymm9 + vpacksswb ymm0, ymm0, ymm1 + vpacksswb ymm2, ymm2, ymm3 + vpermq ymm0, ymm0, 216 + vpermq ymm2, ymm2, 216 + vpmovmskb eax, ymm0 + vpmovmskb r8d, ymm2 + mov DWORD PTR [rcx], eax + mov DWORD PTR [rcx+4], r8d + vpsubw ymm0, ymm8, [rdx+128] + vpsubw ymm1, ymm8, [rdx+160] + vpsubw ymm2, ymm8, [rdx+192] + vpsubw ymm3, ymm8, [rdx+224] + vpsraw ymm4, ymm0, 15 + vpsraw ymm5, ymm1, 15 + vpsraw ymm6, ymm2, 15 + vpsraw ymm7, ymm3, 15 + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm5 + vpxor ymm2, ymm2, ymm6 + vpxor ymm3, ymm3, ymm7 + vpaddw ymm0, ymm0, ymm9 + vpaddw ymm1, ymm1, ymm9 + vpaddw ymm2, ymm2, ymm9 + vpaddw ymm3, ymm3, ymm9 + vpacksswb ymm0, ymm0, ymm1 + vpacksswb ymm2, ymm2, ymm3 + vpermq ymm0, ymm0, 216 + vpermq ymm2, ymm2, 216 + vpmovmskb eax, ymm0 + vpmovmskb r8d, ymm2 + mov DWORD PTR [rcx+8], eax + mov DWORD PTR [rcx+12], r8d + vpsubw ymm0, ymm8, [rdx+256] + vpsubw ymm1, ymm8, [rdx+288] + vpsubw ymm2, ymm8, [rdx+320] + vpsubw ymm3, ymm8, [rdx+352] + vpsraw ymm4, ymm0, 15 + vpsraw ymm5, ymm1, 15 + vpsraw ymm6, ymm2, 15 + vpsraw ymm7, ymm3, 15 + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm5 + vpxor ymm2, ymm2, ymm6 + vpxor ymm3, ymm3, ymm7 + vpaddw ymm0, ymm0, ymm9 + vpaddw ymm1, ymm1, ymm9 + vpaddw ymm2, ymm2, ymm9 + vpaddw ymm3, ymm3, ymm9 + vpacksswb ymm0, ymm0, ymm1 + vpacksswb ymm2, ymm2, ymm3 + vpermq ymm0, ymm0, 216 + vpermq ymm2, ymm2, 216 + vpmovmskb eax, ymm0 + vpmovmskb r8d, ymm2 + mov DWORD PTR [rcx+16], eax + mov DWORD PTR [rcx+20], r8d + vpsubw ymm0, ymm8, [rdx+384] + vpsubw ymm1, ymm8, [rdx+416] + vpsubw ymm2, ymm8, [rdx+448] + vpsubw ymm3, ymm8, [rdx+480] + vpsraw ymm4, ymm0, 15 + vpsraw ymm5, ymm1, 15 + vpsraw ymm6, ymm2, 15 + vpsraw ymm7, ymm3, 15 + vpxor ymm0, ymm0, ymm4 + vpxor ymm1, ymm1, ymm5 + vpxor ymm2, ymm2, ymm6 + vpxor ymm3, ymm3, ymm7 + vpaddw ymm0, ymm0, ymm9 + vpaddw ymm1, ymm1, ymm9 + vpaddw ymm2, ymm2, ymm9 + 
vpaddw ymm3, ymm3, ymm9 + vpacksswb ymm0, ymm0, ymm1 + vpacksswb ymm2, ymm2, ymm3 + vpermq ymm0, ymm0, 216 + vpermq ymm2, ymm2, 216 + vpmovmskb eax, ymm0 + vpmovmskb r8d, ymm2 + mov DWORD PTR [rcx+24], eax + mov DWORD PTR [rcx+28], r8d + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + add rsp, 64 + ret +mlkem_to_msg_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_from_bytes_avx2_shuf BYTE 00h, 01h, 02h, 0ffh, 03h, 04h, 05h, 0ffh + BYTE 06h, 07h, 08h, 0ffh, 09h, 0ah, 0bh, 0ffh + BYTE 04h, 05h, 06h, 0ffh, 07h, 08h, 09h, 0ffh + BYTE 0ah, 0bh, 0ch, 0ffh, 0dh, 0eh, 0fh, 0ffh +ptr_L_mlkem_from_bytes_avx2_shuf QWORD L_mlkem_from_bytes_avx2_shuf +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_from_bytes_avx2_mask DWORD 00000fffh, 00000fffh, 00000fffh, 00000fffh + DWORD 00000fffh, 00000fffh, 00000fffh, 00000fffh +ptr_L_mlkem_from_bytes_avx2_mask QWORD L_mlkem_from_bytes_avx2_mask +_DATA ENDS +_TEXT SEGMENT READONLY PARA +mlkem_from_bytes_avx2 PROC + sub rsp, 128 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm12, YMMWORD PTR L_mlkem_from_bytes_avx2_shuf + vmovdqu ymm13, YMMWORD PTR L_mlkem_from_bytes_avx2_mask + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm1, YMMWORD PTR [rdx+32] + vmovdqu ymm2, YMMWORD PTR [rdx+64] + vmovdqu ymm3, YMMWORD PTR [rdx+96] + vmovdqu ymm4, YMMWORD PTR [rdx+128] + vmovdqu ymm5, YMMWORD PTR [rdx+160] + vpermq ymm7, ymm5, 233 + vpermq ymm8, ymm5, 0 + vpermq ymm6, ymm4, 62 + vpermq ymm9, ymm4, 64 + vpermq ymm5, ymm3, 3 + vpermq ymm4, ymm3, 148 + vpermq ymm3, ymm2, 233 + vpermq ymm10, ymm2, 0 + vpermq ymm2, ymm1, 62 + vpermq ymm11, ymm1, 64 + vpermq ymm1, ymm0, 3 + 
vpermq ymm0, ymm0, 148 + vpblendd ymm6, ymm6, ymm8, 192 + vpblendd ymm5, ymm5, ymm9, 252 + vpblendd ymm2, ymm2, ymm10, 192 + vpblendd ymm1, ymm1, ymm11, 252 + vpshufb ymm0, ymm0, ymm12 + vpshufb ymm1, ymm1, ymm12 + vpshufb ymm2, ymm2, ymm12 + vpshufb ymm3, ymm3, ymm12 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpshufb ymm6, ymm6, ymm12 + vpshufb ymm7, ymm7, ymm12 + vpandn ymm8, ymm13, ymm0 + vpandn ymm9, ymm13, ymm1 + vpandn ymm10, ymm13, ymm2 + vpandn ymm11, ymm13, ymm3 + vpand ymm0, ymm13, ymm0 + vpand ymm1, ymm13, ymm1 + vpand ymm2, ymm13, ymm2 + vpand ymm3, ymm13, ymm3 + vpslld ymm8, ymm8, 4 + vpslld ymm9, ymm9, 4 + vpslld ymm10, ymm10, 4 + vpslld ymm11, ymm11, 4 + vpor ymm0, ymm0, ymm8 + vpor ymm1, ymm1, ymm9 + vpor ymm2, ymm2, ymm10 + vpor ymm3, ymm3, ymm11 + vpandn ymm8, ymm13, ymm4 + vpandn ymm9, ymm13, ymm5 + vpandn ymm10, ymm13, ymm6 + vpandn ymm11, ymm13, ymm7 + vpand ymm4, ymm13, ymm4 + vpand ymm5, ymm13, ymm5 + vpand ymm6, ymm13, ymm6 + vpand ymm7, ymm13, ymm7 + vpslld ymm8, ymm8, 4 + vpslld ymm9, ymm9, 4 + vpslld ymm10, ymm10, 4 + vpslld ymm11, ymm11, 4 + vpor ymm4, ymm4, ymm8 + vpor ymm5, ymm5, ymm9 + vpor ymm6, ymm6, ymm10 + vpor ymm7, ymm7, ymm11 + vmovdqu YMMWORD PTR [rcx], ymm0 + vmovdqu YMMWORD PTR [rcx+32], ymm1 + vmovdqu YMMWORD PTR [rcx+64], ymm2 + vmovdqu YMMWORD PTR [rcx+96], ymm3 + vmovdqu YMMWORD PTR [rcx+128], ymm4 + vmovdqu YMMWORD PTR [rcx+160], ymm5 + vmovdqu YMMWORD PTR [rcx+192], ymm6 + vmovdqu YMMWORD PTR [rcx+224], ymm7 + vmovdqu ymm0, YMMWORD PTR [rdx+192] + vmovdqu ymm1, YMMWORD PTR [rdx+224] + vmovdqu ymm2, YMMWORD PTR [rdx+256] + vmovdqu ymm3, YMMWORD PTR [rdx+288] + vmovdqu ymm4, YMMWORD PTR [rdx+320] + vmovdqu ymm5, YMMWORD PTR [rdx+352] + vpermq ymm7, ymm5, 233 + vpermq ymm8, ymm5, 0 + vpermq ymm6, ymm4, 62 + vpermq ymm9, ymm4, 64 + vpermq ymm5, ymm3, 3 + vpermq ymm4, ymm3, 148 + vpermq ymm3, ymm2, 233 + vpermq ymm10, ymm2, 0 + vpermq ymm2, ymm1, 62 + vpermq ymm11, ymm1, 64 + vpermq ymm1, ymm0, 3 + vpermq 
ymm0, ymm0, 148 + vpblendd ymm6, ymm6, ymm8, 192 + vpblendd ymm5, ymm5, ymm9, 252 + vpblendd ymm2, ymm2, ymm10, 192 + vpblendd ymm1, ymm1, ymm11, 252 + vpshufb ymm0, ymm0, ymm12 + vpshufb ymm1, ymm1, ymm12 + vpshufb ymm2, ymm2, ymm12 + vpshufb ymm3, ymm3, ymm12 + vpshufb ymm4, ymm4, ymm12 + vpshufb ymm5, ymm5, ymm12 + vpshufb ymm6, ymm6, ymm12 + vpshufb ymm7, ymm7, ymm12 + vpandn ymm8, ymm13, ymm0 + vpandn ymm9, ymm13, ymm1 + vpandn ymm10, ymm13, ymm2 + vpandn ymm11, ymm13, ymm3 + vpand ymm0, ymm13, ymm0 + vpand ymm1, ymm13, ymm1 + vpand ymm2, ymm13, ymm2 + vpand ymm3, ymm13, ymm3 + vpslld ymm8, ymm8, 4 + vpslld ymm9, ymm9, 4 + vpslld ymm10, ymm10, 4 + vpslld ymm11, ymm11, 4 + vpor ymm0, ymm0, ymm8 + vpor ymm1, ymm1, ymm9 + vpor ymm2, ymm2, ymm10 + vpor ymm3, ymm3, ymm11 + vpandn ymm8, ymm13, ymm4 + vpandn ymm9, ymm13, ymm5 + vpandn ymm10, ymm13, ymm6 + vpandn ymm11, ymm13, ymm7 + vpand ymm4, ymm13, ymm4 + vpand ymm5, ymm13, ymm5 + vpand ymm6, ymm13, ymm6 + vpand ymm7, ymm13, ymm7 + vpslld ymm8, ymm8, 4 + vpslld ymm9, ymm9, 4 + vpslld ymm10, ymm10, 4 + vpslld ymm11, ymm11, 4 + vpor ymm4, ymm4, ymm8 + vpor ymm5, ymm5, ymm9 + vpor ymm6, ymm6, ymm10 + vpor ymm7, ymm7, ymm11 + vmovdqu YMMWORD PTR [rcx+256], ymm0 + vmovdqu YMMWORD PTR [rcx+288], ymm1 + vmovdqu YMMWORD PTR [rcx+320], ymm2 + vmovdqu YMMWORD PTR [rcx+352], ymm3 + vmovdqu YMMWORD PTR [rcx+384], ymm4 + vmovdqu YMMWORD PTR [rcx+416], ymm5 + vmovdqu YMMWORD PTR [rcx+448], ymm6 + vmovdqu YMMWORD PTR [rcx+480], ymm7 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + add rsp, 128 + ret +mlkem_from_bytes_avx2 ENDP +_TEXT ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_to_bytes_avx2_mask DWORD 00000fffh, 00000fffh, 00000fffh, 00000fffh + DWORD 00000fffh, 00000fffh, 00000fffh, 
00000fffh +ptr_L_mlkem_to_bytes_avx2_mask QWORD L_mlkem_to_bytes_avx2_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_to_bytes_avx2_shuf BYTE 00h, 01h, 02h, 04h, 05h, 06h, 08h, 09h + BYTE 0ah, 0ch, 0dh, 0eh, 0ffh, 0ffh, 0ffh, 0ffh + BYTE 05h, 06h, 08h, 09h, 0ah, 0ch, 0dh, 0eh + BYTE 0ffh, 0ffh, 0ffh, 0ffh, 00h, 01h, 02h, 04h +ptr_L_mlkem_to_bytes_avx2_shuf QWORD L_mlkem_to_bytes_avx2_shuf +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_mlkem_to_bytes_avx2_perm DWORD 00000000h, 00000001h, 00000002h, 00000007h + DWORD 00000004h, 00000005h, 00000003h, 00000006h +ptr_L_mlkem_to_bytes_avx2_perm QWORD L_mlkem_to_bytes_avx2_perm +_DATA ENDS +_TEXT SEGMENT READONLY PARA +mlkem_to_bytes_avx2 PROC + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqu ymm12, YMMWORD PTR mlkem_q + vmovdqu ymm13, YMMWORD PTR L_mlkem_to_bytes_avx2_mask + vmovdqu ymm14, YMMWORD PTR L_mlkem_to_bytes_avx2_shuf + vmovdqu ymm15, YMMWORD PTR L_mlkem_to_bytes_avx2_perm + vmovdqu ymm0, YMMWORD PTR [rdx] + vmovdqu ymm1, YMMWORD PTR [rdx+32] + vmovdqu ymm2, YMMWORD PTR [rdx+64] + vmovdqu ymm3, YMMWORD PTR [rdx+96] + vmovdqu ymm4, YMMWORD PTR [rdx+128] + vmovdqu ymm5, YMMWORD PTR [rdx+160] + vmovdqu ymm6, YMMWORD PTR [rdx+192] + vmovdqu ymm7, YMMWORD PTR [rdx+224] + vpsubw ymm8, ymm0, ymm12 + vpsubw ymm9, ymm1, ymm12 + vpsubw ymm10, ymm2, ymm12 + vpsubw ymm11, ymm3, ymm12 + vpsraw ymm0, ymm8, 15 + vpsraw ymm1, ymm9, 15 + vpsraw ymm2, ymm10, 15 + vpsraw ymm3, ymm11, 15 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpaddw ymm2, ymm2, ymm10 + vpaddw ymm3, ymm3, ymm11 + vpsubw ymm8, 
ymm4, ymm12 + vpsubw ymm9, ymm5, ymm12 + vpsubw ymm10, ymm6, ymm12 + vpsubw ymm11, ymm7, ymm12 + vpsraw ymm4, ymm8, 15 + vpsraw ymm5, ymm9, 15 + vpsraw ymm6, ymm10, 15 + vpsraw ymm7, ymm11, 15 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vpand ymm6, ymm6, ymm12 + vpand ymm7, ymm7, ymm12 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm5, ymm5, ymm9 + vpaddw ymm6, ymm6, ymm10 + vpaddw ymm7, ymm7, ymm11 + vpsrld ymm8, ymm0, 16 + vpsrld ymm9, ymm1, 16 + vpsrld ymm10, ymm2, 16 + vpsrld ymm11, ymm3, 16 + vpand ymm0, ymm13, ymm0 + vpand ymm1, ymm13, ymm1 + vpand ymm2, ymm13, ymm2 + vpand ymm3, ymm13, ymm3 + vpslld ymm8, ymm8, 12 + vpslld ymm9, ymm9, 12 + vpslld ymm10, ymm10, 12 + vpslld ymm11, ymm11, 12 + vpor ymm0, ymm0, ymm8 + vpor ymm1, ymm1, ymm9 + vpor ymm2, ymm2, ymm10 + vpor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm4, 16 + vpsrld ymm9, ymm5, 16 + vpsrld ymm10, ymm6, 16 + vpsrld ymm11, ymm7, 16 + vpand ymm4, ymm13, ymm4 + vpand ymm5, ymm13, ymm5 + vpand ymm6, ymm13, ymm6 + vpand ymm7, ymm13, ymm7 + vpslld ymm8, ymm8, 12 + vpslld ymm9, ymm9, 12 + vpslld ymm10, ymm10, 12 + vpslld ymm11, ymm11, 12 + vpor ymm4, ymm4, ymm8 + vpor ymm5, ymm5, ymm9 + vpor ymm6, ymm6, ymm10 + vpor ymm7, ymm7, ymm11 + vpshufb ymm0, ymm0, ymm14 + vpshufb ymm1, ymm1, ymm14 + vpshufb ymm2, ymm2, ymm14 + vpshufb ymm3, ymm3, ymm14 + vpshufb ymm4, ymm4, ymm14 + vpshufb ymm5, ymm5, ymm14 + vpshufb ymm6, ymm6, ymm14 + vpshufb ymm7, ymm7, ymm14 + vpermd ymm0, ymm15, ymm0 + vpermd ymm1, ymm15, ymm1 + vpermd ymm2, ymm15, ymm2 + vpermd ymm3, ymm15, ymm3 + vpermd ymm4, ymm15, ymm4 + vpermd ymm5, ymm15, ymm5 + vpermd ymm6, ymm15, ymm6 + vpermd ymm7, ymm15, ymm7 + vpermq ymm8, ymm6, 2 + vpermq ymm7, ymm7, 144 + vpermq ymm9, ymm5, 9 + vpermq ymm6, ymm6, 64 + vpermq ymm5, ymm5, 0 + vpblendd ymm5, ymm5, ymm4, 63 + vpermq ymm10, ymm2, 2 + vpermq ymm4, ymm3, 144 + vpermq ymm11, ymm1, 9 + vpermq ymm3, ymm2, 64 + vpermq ymm2, ymm1, 0 + vpblendd ymm2, ymm2, ymm0, 63 + vpblendd ymm7, ymm7, ymm8, 3 + vpblendd ymm6, ymm6, 
ymm9, 15 + vpblendd ymm4, ymm4, ymm10, 3 + vpblendd ymm3, ymm3, ymm11, 15 + vmovdqu YMMWORD PTR [rcx], ymm2 + vmovdqu YMMWORD PTR [rcx+32], ymm3 + vmovdqu YMMWORD PTR [rcx+64], ymm4 + vmovdqu YMMWORD PTR [rcx+96], ymm5 + vmovdqu YMMWORD PTR [rcx+128], ymm6 + vmovdqu YMMWORD PTR [rcx+160], ymm7 + vmovdqu ymm0, YMMWORD PTR [rdx+256] + vmovdqu ymm1, YMMWORD PTR [rdx+288] + vmovdqu ymm2, YMMWORD PTR [rdx+320] + vmovdqu ymm3, YMMWORD PTR [rdx+352] + vmovdqu ymm4, YMMWORD PTR [rdx+384] + vmovdqu ymm5, YMMWORD PTR [rdx+416] + vmovdqu ymm6, YMMWORD PTR [rdx+448] + vmovdqu ymm7, YMMWORD PTR [rdx+480] + vpsubw ymm8, ymm0, ymm12 + vpsubw ymm9, ymm1, ymm12 + vpsubw ymm10, ymm2, ymm12 + vpsubw ymm11, ymm3, ymm12 + vpsraw ymm0, ymm8, 15 + vpsraw ymm1, ymm9, 15 + vpsraw ymm2, ymm10, 15 + vpsraw ymm3, ymm11, 15 + vpand ymm0, ymm0, ymm12 + vpand ymm1, ymm1, ymm12 + vpand ymm2, ymm2, ymm12 + vpand ymm3, ymm3, ymm12 + vpaddw ymm0, ymm0, ymm8 + vpaddw ymm1, ymm1, ymm9 + vpaddw ymm2, ymm2, ymm10 + vpaddw ymm3, ymm3, ymm11 + vpsubw ymm8, ymm4, ymm12 + vpsubw ymm9, ymm5, ymm12 + vpsubw ymm10, ymm6, ymm12 + vpsubw ymm11, ymm7, ymm12 + vpsraw ymm4, ymm8, 15 + vpsraw ymm5, ymm9, 15 + vpsraw ymm6, ymm10, 15 + vpsraw ymm7, ymm11, 15 + vpand ymm4, ymm4, ymm12 + vpand ymm5, ymm5, ymm12 + vpand ymm6, ymm6, ymm12 + vpand ymm7, ymm7, ymm12 + vpaddw ymm4, ymm4, ymm8 + vpaddw ymm5, ymm5, ymm9 + vpaddw ymm6, ymm6, ymm10 + vpaddw ymm7, ymm7, ymm11 + vpsrld ymm8, ymm0, 16 + vpsrld ymm9, ymm1, 16 + vpsrld ymm10, ymm2, 16 + vpsrld ymm11, ymm3, 16 + vpand ymm0, ymm13, ymm0 + vpand ymm1, ymm13, ymm1 + vpand ymm2, ymm13, ymm2 + vpand ymm3, ymm13, ymm3 + vpslld ymm8, ymm8, 12 + vpslld ymm9, ymm9, 12 + vpslld ymm10, ymm10, 12 + vpslld ymm11, ymm11, 12 + vpor ymm0, ymm0, ymm8 + vpor ymm1, ymm1, ymm9 + vpor ymm2, ymm2, ymm10 + vpor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm4, 16 + vpsrld ymm9, ymm5, 16 + vpsrld ymm10, ymm6, 16 + vpsrld ymm11, ymm7, 16 + vpand ymm4, ymm13, ymm4 + vpand ymm5, ymm13, ymm5 + vpand ymm6, 
ymm13, ymm6 + vpand ymm7, ymm13, ymm7 + vpslld ymm8, ymm8, 12 + vpslld ymm9, ymm9, 12 + vpslld ymm10, ymm10, 12 + vpslld ymm11, ymm11, 12 + vpor ymm4, ymm4, ymm8 + vpor ymm5, ymm5, ymm9 + vpor ymm6, ymm6, ymm10 + vpor ymm7, ymm7, ymm11 + vpshufb ymm0, ymm0, ymm14 + vpshufb ymm1, ymm1, ymm14 + vpshufb ymm2, ymm2, ymm14 + vpshufb ymm3, ymm3, ymm14 + vpshufb ymm4, ymm4, ymm14 + vpshufb ymm5, ymm5, ymm14 + vpshufb ymm6, ymm6, ymm14 + vpshufb ymm7, ymm7, ymm14 + vpermd ymm0, ymm15, ymm0 + vpermd ymm1, ymm15, ymm1 + vpermd ymm2, ymm15, ymm2 + vpermd ymm3, ymm15, ymm3 + vpermd ymm4, ymm15, ymm4 + vpermd ymm5, ymm15, ymm5 + vpermd ymm6, ymm15, ymm6 + vpermd ymm7, ymm15, ymm7 + vpermq ymm8, ymm6, 2 + vpermq ymm7, ymm7, 144 + vpermq ymm9, ymm5, 9 + vpermq ymm6, ymm6, 64 + vpermq ymm5, ymm5, 0 + vpblendd ymm5, ymm5, ymm4, 63 + vpermq ymm10, ymm2, 2 + vpermq ymm4, ymm3, 144 + vpermq ymm11, ymm1, 9 + vpermq ymm3, ymm2, 64 + vpermq ymm2, ymm1, 0 + vpblendd ymm2, ymm2, ymm0, 63 + vpblendd ymm7, ymm7, ymm8, 3 + vpblendd ymm6, ymm6, ymm9, 15 + vpblendd ymm4, ymm4, ymm10, 3 + vpblendd ymm3, ymm3, ymm11, 15 + vmovdqu YMMWORD PTR [rcx+192], ymm2 + vmovdqu YMMWORD PTR [rcx+224], ymm3 + vmovdqu YMMWORD PTR [rcx+256], ymm4 + vmovdqu YMMWORD PTR [rcx+288], ymm5 + vmovdqu YMMWORD PTR [rcx+320], ymm6 + vmovdqu YMMWORD PTR [rcx+352], ymm7 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + ret +mlkem_to_bytes_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +mlkem_cmp_avx2 PROC + vpxor ymm2, ymm2, ymm2 + vpxor ymm3, ymm3, ymm3 + mov r9d, 0 + mov r10d, -1 + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vpxor ymm0, ymm0, [rdx] + vpxor 
ymm1, ymm1, [rdx+32] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+64] + vmovdqu ymm1, YMMWORD PTR [rcx+96] + vpxor ymm0, ymm0, [rdx+64] + vpxor ymm1, ymm1, [rdx+96] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+128] + vmovdqu ymm1, YMMWORD PTR [rcx+160] + vpxor ymm0, ymm0, [rdx+128] + vpxor ymm1, ymm1, [rdx+160] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+192] + vmovdqu ymm1, YMMWORD PTR [rcx+224] + vpxor ymm0, ymm0, [rdx+192] + vpxor ymm1, ymm1, [rdx+224] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vpxor ymm0, ymm0, [rdx+256] + vpxor ymm1, ymm1, [rdx+288] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+320] + vmovdqu ymm1, YMMWORD PTR [rcx+352] + vpxor ymm0, ymm0, [rdx+320] + vpxor ymm1, ymm1, [rdx+352] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+384] + vmovdqu ymm1, YMMWORD PTR [rcx+416] + vpxor ymm0, ymm0, [rdx+384] + vpxor ymm1, ymm1, [rdx+416] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+448] + vmovdqu ymm1, YMMWORD PTR [rcx+480] + vpxor ymm0, ymm0, [rdx+448] + vpxor ymm1, ymm1, [rdx+480] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+512] + vmovdqu ymm1, YMMWORD PTR [rcx+544] + vpxor ymm0, ymm0, [rdx+512] + vpxor ymm1, ymm1, [rdx+544] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+576] + vmovdqu ymm1, YMMWORD PTR [rcx+608] + vpxor ymm0, ymm0, [rdx+576] + vpxor ymm1, ymm1, [rdx+608] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+640] + vmovdqu ymm1, YMMWORD PTR [rcx+672] + vpxor ymm0, ymm0, [rdx+640] + vpxor ymm1, ymm1, [rdx+672] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+704] + vmovdqu ymm1, YMMWORD PTR [rcx+736] + vpxor ymm0, 
ymm0, [rdx+704] + vpxor ymm1, ymm1, [rdx+736] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + sub r8d, 768 + jz L_mlkem_cmp_avx2_done + vmovdqu ymm0, YMMWORD PTR [rcx+768] + vmovdqu ymm1, YMMWORD PTR [rcx+800] + vpxor ymm0, ymm0, [rdx+768] + vpxor ymm1, ymm1, [rdx+800] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+832] + vmovdqu ymm1, YMMWORD PTR [rcx+864] + vpxor ymm0, ymm0, [rdx+832] + vpxor ymm1, ymm1, [rdx+864] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+896] + vmovdqu ymm1, YMMWORD PTR [rcx+928] + vpxor ymm0, ymm0, [rdx+896] + vpxor ymm1, ymm1, [rdx+928] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+960] + vmovdqu ymm1, YMMWORD PTR [rcx+992] + vpxor ymm0, ymm0, [rdx+960] + vpxor ymm1, ymm1, [rdx+992] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+1024] + vmovdqu ymm1, YMMWORD PTR [rcx+1056] + vpxor ymm0, ymm0, [rdx+1024] + vpxor ymm1, ymm1, [rdx+1056] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + sub r8d, 320 + jz L_mlkem_cmp_avx2_done + vmovdqu ymm0, YMMWORD PTR [rcx+1088] + vmovdqu ymm1, YMMWORD PTR [rcx+1120] + vpxor ymm0, ymm0, [rdx+1088] + vpxor ymm1, ymm1, [rdx+1120] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+1152] + vmovdqu ymm1, YMMWORD PTR [rcx+1184] + vpxor ymm0, ymm0, [rdx+1152] + vpxor ymm1, ymm1, [rdx+1184] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+1216] + vmovdqu ymm1, YMMWORD PTR [rcx+1248] + vpxor ymm0, ymm0, [rdx+1216] + vpxor ymm1, ymm1, [rdx+1248] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+1280] + vmovdqu ymm1, YMMWORD PTR [rcx+1312] + vpxor ymm0, ymm0, [rdx+1280] + vpxor ymm1, ymm1, [rdx+1312] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+1344] + vmovdqu ymm1, YMMWORD PTR [rcx+1376] + vpxor ymm0, ymm0, [rdx+1344] + vpxor ymm1, ymm1, [rdx+1376] + 
vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+1408] + vmovdqu ymm1, YMMWORD PTR [rcx+1440] + vpxor ymm0, ymm0, [rdx+1408] + vpxor ymm1, ymm1, [rdx+1440] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+1472] + vmovdqu ymm1, YMMWORD PTR [rcx+1504] + vpxor ymm0, ymm0, [rdx+1472] + vpxor ymm1, ymm1, [rdx+1504] + vpor ymm2, ymm2, ymm0 + vpor ymm3, ymm3, ymm1 + vmovdqu ymm0, YMMWORD PTR [rcx+1536] + vpxor ymm0, ymm0, [rdx+1536] + vpor ymm2, ymm2, ymm0 +L_mlkem_cmp_avx2_done: + vpor ymm2, ymm2, ymm3 + vptest ymm2, ymm2 + cmovz eax, r9d + cmovnz eax, r10d + vzeroupper + ret +mlkem_cmp_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +mlkem_redistribute_21_rand_avx2 PROC + push r12 + push r13 + mov rax, QWORD PTR [rsp+56] + sub rsp, 160 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu OWORD PTR [rsp+96], xmm12 + vmovdqu OWORD PTR [rsp+112], xmm13 + vmovdqu OWORD PTR [rsp+128], xmm14 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vmovdqu ymm4, YMMWORD PTR [rcx+128] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vmovdqu ymm6, YMMWORD PTR [rcx+192] + vmovdqu ymm7, YMMWORD PTR [rcx+224] + vmovdqu ymm8, YMMWORD PTR [rcx+256] + vmovdqu ymm9, YMMWORD PTR [rcx+288] + vmovdqu ymm10, YMMWORD PTR [rcx+320] + vmovdqu ymm11, YMMWORD PTR [rcx+352] + vpunpcklqdq ymm12, ymm0, ymm1 + vpunpckhqdq ymm13, ymm0, ymm1 + vpunpcklqdq ymm14, ymm2, ymm3 + vpunpckhqdq ymm15, ymm2, ymm3 + vperm2i128 ymm0, ymm12, ymm14, 32 + vperm2i128 ymm1, ymm13, ymm15, 32 + vperm2i128 ymm2, ymm12, ymm14, 49 + vperm2i128 ymm3, ymm13, ymm15, 49 + vpunpcklqdq ymm12, ymm4, ymm5 + vpunpckhqdq ymm13, ymm4, ymm5 + vpunpcklqdq ymm14, ymm6, ymm7 + vpunpckhqdq 
ymm15, ymm6, ymm7 + vperm2i128 ymm4, ymm12, ymm14, 32 + vperm2i128 ymm5, ymm13, ymm15, 32 + vperm2i128 ymm6, ymm12, ymm14, 49 + vperm2i128 ymm7, ymm13, ymm15, 49 + vpunpcklqdq ymm12, ymm8, ymm9 + vpunpckhqdq ymm13, ymm8, ymm9 + vpunpcklqdq ymm14, ymm10, ymm11 + vpunpckhqdq ymm15, ymm10, ymm11 + vperm2i128 ymm8, ymm12, ymm14, 32 + vperm2i128 ymm9, ymm13, ymm15, 32 + vperm2i128 ymm10, ymm12, ymm14, 49 + vperm2i128 ymm11, ymm13, ymm15, 49 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm4 + vmovdqu YMMWORD PTR [rdx+64], ymm8 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm5 + vmovdqu YMMWORD PTR [r8+64], ymm9 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm6 + vmovdqu YMMWORD PTR [r9+64], ymm10 + vmovdqu YMMWORD PTR [rax], ymm3 + vmovdqu YMMWORD PTR [rax+32], ymm7 + vmovdqu YMMWORD PTR [rax+64], ymm11 + vmovdqu ymm0, YMMWORD PTR [rcx+384] + vmovdqu ymm1, YMMWORD PTR [rcx+416] + vmovdqu ymm2, YMMWORD PTR [rcx+448] + vmovdqu ymm3, YMMWORD PTR [rcx+480] + vmovdqu ymm4, YMMWORD PTR [rcx+512] + vmovdqu ymm5, YMMWORD PTR [rcx+544] + vmovdqu ymm6, YMMWORD PTR [rcx+576] + vmovdqu ymm7, YMMWORD PTR [rcx+608] + mov r10, QWORD PTR [rcx+640] + mov r11, QWORD PTR [rcx+648] + mov r12, QWORD PTR [rcx+656] + mov r13, QWORD PTR [rcx+664] + vpunpcklqdq ymm12, ymm0, ymm1 + vpunpckhqdq ymm13, ymm0, ymm1 + vpunpcklqdq ymm14, ymm2, ymm3 + vpunpckhqdq ymm15, ymm2, ymm3 + vperm2i128 ymm0, ymm12, ymm14, 32 + vperm2i128 ymm1, ymm13, ymm15, 32 + vperm2i128 ymm2, ymm12, ymm14, 49 + vperm2i128 ymm3, ymm13, ymm15, 49 + vpunpcklqdq ymm12, ymm4, ymm5 + vpunpckhqdq ymm13, ymm4, ymm5 + vpunpcklqdq ymm14, ymm6, ymm7 + vpunpckhqdq ymm15, ymm6, ymm7 + vperm2i128 ymm4, ymm12, ymm14, 32 + vperm2i128 ymm5, ymm13, ymm15, 32 + vperm2i128 ymm6, ymm12, ymm14, 49 + vperm2i128 ymm7, ymm13, ymm15, 49 + vmovdqu YMMWORD PTR [rdx+96], ymm0 + vmovdqu YMMWORD PTR [rdx+128], ymm4 + mov QWORD PTR [rdx+160], r10 + vmovdqu YMMWORD PTR [r8+96], ymm1 + vmovdqu YMMWORD 
PTR [r8+128], ymm5 + mov QWORD PTR [r8+160], r11 + vmovdqu YMMWORD PTR [r9+96], ymm2 + vmovdqu YMMWORD PTR [r9+128], ymm6 + mov QWORD PTR [r9+160], r12 + vmovdqu YMMWORD PTR [rax+96], ymm3 + vmovdqu YMMWORD PTR [rax+128], ymm7 + mov QWORD PTR [rax+160], r13 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + vmovdqu xmm12, OWORD PTR [rsp+96] + vmovdqu xmm13, OWORD PTR [rsp+112] + vmovdqu xmm14, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] + add rsp, 160 + pop r13 + pop r12 + ret +mlkem_redistribute_21_rand_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +mlkem_redistribute_17_rand_avx2 PROC + push r12 + push r13 + mov rax, QWORD PTR [rsp+56] + sub rsp, 96 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vmovdqu ymm4, YMMWORD PTR [rcx+128] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vmovdqu ymm6, YMMWORD PTR [rcx+192] + vmovdqu ymm7, YMMWORD PTR [rcx+224] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm9, ymm0, ymm1 + vpunpcklqdq ymm10, ymm2, ymm3 + vpunpckhqdq ymm11, ymm2, ymm3 + vperm2i128 ymm0, ymm8, ymm10, 32 + vperm2i128 ymm1, ymm9, ymm11, 32 + vperm2i128 ymm2, ymm8, ymm10, 49 + vperm2i128 ymm3, ymm9, ymm11, 49 + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm9, ymm4, ymm5 + vpunpcklqdq ymm10, ymm6, ymm7 + vpunpckhqdq ymm11, ymm6, ymm7 + vperm2i128 ymm4, ymm8, ymm10, 32 + vperm2i128 ymm5, ymm9, ymm11, 32 + vperm2i128 ymm6, ymm8, ymm10, 49 + vperm2i128 ymm7, ymm9, ymm11, 49 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm4 + vmovdqu YMMWORD PTR [r8], ymm1 + 
vmovdqu YMMWORD PTR [r8+32], ymm5 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm6 + vmovdqu YMMWORD PTR [rax], ymm3 + vmovdqu YMMWORD PTR [rax+32], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vmovdqu ymm4, YMMWORD PTR [rcx+384] + vmovdqu ymm5, YMMWORD PTR [rcx+416] + vmovdqu ymm6, YMMWORD PTR [rcx+448] + vmovdqu ymm7, YMMWORD PTR [rcx+480] + mov r10, QWORD PTR [rcx+512] + mov r11, QWORD PTR [rcx+520] + mov r12, QWORD PTR [rcx+528] + mov r13, QWORD PTR [rcx+536] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm9, ymm0, ymm1 + vpunpcklqdq ymm10, ymm2, ymm3 + vpunpckhqdq ymm11, ymm2, ymm3 + vperm2i128 ymm0, ymm8, ymm10, 32 + vperm2i128 ymm1, ymm9, ymm11, 32 + vperm2i128 ymm2, ymm8, ymm10, 49 + vperm2i128 ymm3, ymm9, ymm11, 49 + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm9, ymm4, ymm5 + vpunpcklqdq ymm10, ymm6, ymm7 + vpunpckhqdq ymm11, ymm6, ymm7 + vperm2i128 ymm4, ymm8, ymm10, 32 + vperm2i128 ymm5, ymm9, ymm11, 32 + vperm2i128 ymm6, ymm8, ymm10, 49 + vperm2i128 ymm7, ymm9, ymm11, 49 + vmovdqu YMMWORD PTR [rdx+64], ymm0 + vmovdqu YMMWORD PTR [rdx+96], ymm4 + mov QWORD PTR [rdx+128], r10 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm5 + mov QWORD PTR [r8+128], r11 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm6 + mov QWORD PTR [r9+128], r12 + vmovdqu YMMWORD PTR [rax+64], ymm3 + vmovdqu YMMWORD PTR [rax+96], ymm7 + mov QWORD PTR [rax+128], r13 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + add rsp, 96 + pop r13 + pop r12 + ret +mlkem_redistribute_17_rand_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +mlkem_redistribute_16_rand_avx2 PROC + push r12 + push r13 + mov rax, QWORD PTR [rsp+56] + sub rsp, 96 + 
vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vmovdqu ymm4, YMMWORD PTR [rcx+128] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vmovdqu ymm6, YMMWORD PTR [rcx+192] + vmovdqu ymm7, YMMWORD PTR [rcx+224] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm9, ymm0, ymm1 + vpunpcklqdq ymm10, ymm2, ymm3 + vpunpckhqdq ymm11, ymm2, ymm3 + vperm2i128 ymm0, ymm8, ymm10, 32 + vperm2i128 ymm1, ymm9, ymm11, 32 + vperm2i128 ymm2, ymm8, ymm10, 49 + vperm2i128 ymm3, ymm9, ymm11, 49 + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm9, ymm4, ymm5 + vpunpcklqdq ymm10, ymm6, ymm7 + vpunpckhqdq ymm11, ymm6, ymm7 + vperm2i128 ymm4, ymm8, ymm10, 32 + vperm2i128 ymm5, ymm9, ymm11, 32 + vperm2i128 ymm6, ymm8, ymm10, 49 + vperm2i128 ymm7, ymm9, ymm11, 49 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], ymm4 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm5 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm6 + vmovdqu YMMWORD PTR [rax], ymm3 + vmovdqu YMMWORD PTR [rax+32], ymm7 + vmovdqu ymm0, YMMWORD PTR [rcx+256] + vmovdqu ymm1, YMMWORD PTR [rcx+288] + vmovdqu ymm2, YMMWORD PTR [rcx+320] + vmovdqu ymm3, YMMWORD PTR [rcx+352] + vmovdqu ymm4, YMMWORD PTR [rcx+384] + vmovdqu ymm5, YMMWORD PTR [rcx+416] + vmovdqu ymm6, YMMWORD PTR [rcx+448] + vmovdqu ymm7, YMMWORD PTR [rcx+480] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm9, ymm0, ymm1 + vpunpcklqdq ymm10, ymm2, ymm3 + vpunpckhqdq ymm11, ymm2, ymm3 + vperm2i128 ymm0, ymm8, ymm10, 32 + vperm2i128 ymm1, ymm9, ymm11, 32 + vperm2i128 ymm2, ymm8, ymm10, 49 + vperm2i128 ymm3, ymm9, ymm11, 49 + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm9, ymm4, ymm5 + vpunpcklqdq ymm10, ymm6, ymm7 + vpunpckhqdq 
ymm11, ymm6, ymm7 + vperm2i128 ymm4, ymm8, ymm10, 32 + vperm2i128 ymm5, ymm9, ymm11, 32 + vperm2i128 ymm6, ymm8, ymm10, 49 + vperm2i128 ymm7, ymm9, ymm11, 49 + vmovdqu YMMWORD PTR [rdx+64], ymm0 + vmovdqu YMMWORD PTR [rdx+96], ymm4 + vmovdqu YMMWORD PTR [r8+64], ymm1 + vmovdqu YMMWORD PTR [r8+96], ymm5 + vmovdqu YMMWORD PTR [r9+64], ymm2 + vmovdqu YMMWORD PTR [r9+96], ymm6 + vmovdqu YMMWORD PTR [rax+64], ymm3 + vmovdqu YMMWORD PTR [rax+96], ymm7 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + add rsp, 96 + pop r13 + pop r12 + ret +mlkem_redistribute_16_rand_avx2 ENDP +_TEXT ENDS +_TEXT SEGMENT READONLY PARA +mlkem_redistribute_8_rand_avx2 PROC + push r12 + push r13 + mov rax, QWORD PTR [rsp+56] + sub rsp, 96 + vmovdqu OWORD PTR [rsp], xmm6 + vmovdqu OWORD PTR [rsp+16], xmm7 + vmovdqu OWORD PTR [rsp+32], xmm8 + vmovdqu OWORD PTR [rsp+48], xmm9 + vmovdqu OWORD PTR [rsp+64], xmm10 + vmovdqu OWORD PTR [rsp+80], xmm11 + vmovdqu ymm0, YMMWORD PTR [rcx] + vmovdqu ymm1, YMMWORD PTR [rcx+32] + vmovdqu ymm2, YMMWORD PTR [rcx+64] + vmovdqu ymm3, YMMWORD PTR [rcx+96] + vmovdqu ymm4, YMMWORD PTR [rcx+128] + vmovdqu ymm5, YMMWORD PTR [rcx+160] + vmovdqu ymm6, YMMWORD PTR [rcx+192] + vmovdqu ymm7, YMMWORD PTR [rcx+224] + vpunpcklqdq ymm8, ymm0, ymm1 + vpunpckhqdq ymm9, ymm0, ymm1 + vpunpcklqdq ymm10, ymm2, ymm3 + vpunpckhqdq ymm11, ymm2, ymm3 + vperm2i128 ymm0, ymm8, ymm10, 32 + vperm2i128 ymm1, ymm9, ymm11, 32 + vperm2i128 ymm2, ymm8, ymm10, 49 + vperm2i128 ymm3, ymm9, ymm11, 49 + vpunpcklqdq ymm8, ymm4, ymm5 + vpunpckhqdq ymm9, ymm4, ymm5 + vpunpcklqdq ymm10, ymm6, ymm7 + vpunpckhqdq ymm11, ymm6, ymm7 + vperm2i128 ymm4, ymm8, ymm10, 32 + vperm2i128 ymm5, ymm9, ymm11, 32 + vperm2i128 ymm6, ymm8, ymm10, 49 + vperm2i128 ymm7, ymm9, ymm11, 49 + vmovdqu YMMWORD PTR [rdx], ymm0 + vmovdqu YMMWORD PTR [rdx+32], 
ymm4 + vmovdqu YMMWORD PTR [r8], ymm1 + vmovdqu YMMWORD PTR [r8+32], ymm5 + vmovdqu YMMWORD PTR [r9], ymm2 + vmovdqu YMMWORD PTR [r9+32], ymm6 + vmovdqu YMMWORD PTR [rax], ymm3 + vmovdqu YMMWORD PTR [rax+32], ymm7 + vzeroupper + vmovdqu xmm6, OWORD PTR [rsp] + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm8, OWORD PTR [rsp+32] + vmovdqu xmm9, OWORD PTR [rsp+48] + vmovdqu xmm10, OWORD PTR [rsp+64] + vmovdqu xmm11, OWORD PTR [rsp+80] + add rsp, 96 + pop r13 + pop r12 + ret +mlkem_redistribute_8_rand_avx2 ENDP +_TEXT ENDS +ENDIF +ENDIF +END diff --git a/wolfssl-VS2022.vcxproj b/wolfssl-VS2022.vcxproj index 81d32758e91..f15f6e9a598 100644 --- a/wolfssl-VS2022.vcxproj +++ b/wolfssl-VS2022.vcxproj @@ -57,6 +57,13 @@ wolfssl + + + false + StaticLibrary v143 @@ -517,6 +524,28 @@ $(OutDir)%(Filename).obj $(IntDir)%(Filename).obj + + false + false + ml.exe /c /safeseh /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml.exe /c /safeseh /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml.exe /c /safeseh /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml.exe /c /safeseh /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + true + true + true + true + true + true + true + true + false false @@ -573,6 +602,42 @@ $(OutDir)%(Filename).obj $(IntDir)%(Filename).obj + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + 
$(IntDir)%(Filename).obj + + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + @@ -585,6 +650,11 @@ true + + + USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions) + + diff --git a/wolfssl.vcxproj b/wolfssl.vcxproj index 44c23ab74ee..e00b6e6122a 100644 --- a/wolfssl.vcxproj +++ b/wolfssl.vcxproj @@ -56,6 +56,13 @@ Win32Proj + + + false + StaticLibrary v110 @@ -517,6 +524,28 @@ $(OutDir)%(Filename).obj $(IntDir)%(Filename).obj + + false + false + ml.exe /c /safeseh /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml.exe /c /safeseh /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml.exe /c /safeseh /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml.exe /c /safeseh /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + true + true + true + true + true + true + true + true + false false @@ -573,6 +602,42 @@ $(OutDir)%(Filename).obj $(IntDir)%(Filename).obj + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false 
+ ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + @@ -585,6 +650,11 @@ true + + + USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions) + + diff --git a/wrapper/CSharp/wolfssl.vcxproj b/wrapper/CSharp/wolfssl.vcxproj index 7a963cbd913..396ff1c1dce 100644 --- a/wrapper/CSharp/wolfssl.vcxproj +++ b/wrapper/CSharp/wolfssl.vcxproj @@ -40,6 +40,13 @@ Win32Proj + + + false + StaticLibrary v143 @@ -399,6 +406,24 @@ $(OutDir)%(Filename).obj $(IntDir)%(Filename).obj + + false + false + ml.exe /c /safeseh /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml.exe /c /safeseh /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml.exe /c /safeseh /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml.exe /c /safeseh /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + true + true + true + true + false false @@ -455,6 +480,42 @@ $(OutDir)%(Filename).obj $(IntDir)%(Filename).obj + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false + ml64.exe 
/c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + + + true + false + ml64.exe /c /Zi /DWOLFSSL_X86_64_BUILD /DWOLFSSL_HAVE_MLKEM /DWOLFSSL_HAVE_MLDSA /DWOLFSSL_HAVE_SLHDSA /DHAVE_ED25519 /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + @@ -465,6 +526,11 @@ true + + + USE_INTEL_SPEEDUP;WOLFSSL_X86_64_BUILD;%(PreprocessorDefinitions) + +